Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 176 additions & 0 deletions pocs/linux/kernelctf/CVE-2024-58239_mitigation/docs/exploit.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# Vulnerability

In `tls_sw_recvmsg`, if the record type (`control`) returned by `process_rx_list` is not `TLS_RECORD_TYPE_DATA`, the function goes on to receive more records.
```C
/* Process pending decrypted records. It must be non-zero-copy */
err = process_rx_list(ctx, msg, &control, 0, len, is_peek);
if (err < 0)
goto end;

copied = err;
if (len <= copied)
goto end;
```

Then, when the next record is of type `TLS_RECORD_TYPE_DATA` and is decrypted with zero-copy, its type (`tlm->control`) differs from the already-set `control`, so `tls_record_content_type` returns `0`:

```C
static int tls_record_content_type(struct msghdr *msg, struct tls_msg *tlm,
u8 *control)
{
int err;

if (!*control) {
*control = tlm->control;
if (!*control)
return -EBADMSG;

err = put_cmsg(msg, SOL_TLS, TLS_GET_RECORD_TYPE,
sizeof(*control), control);
if (*control != TLS_RECORD_TYPE_DATA) {
if (err || msg->msg_flags & MSG_CTRUNC)
return -EIO;
}
} else if (*control != tlm->control) {
return 0;
}

return 1;
}

```
Later, `darg.skb` (which is `strp->anchor`) is queued onto `rx_list` even though `darg.zc == 1`.
```C
err = tls_record_content_type(msg, tls_msg(darg.skb), &control);
if (err <= 0) {
DEBUG_NET_WARN_ON_ONCE(darg.zc);
tls_rx_rec_done(ctx);
put_on_rx_list_err:
__skb_queue_tail(&ctx->rx_list, darg.skb);
goto recv_end;
}
```

When `darg.zc == 1`, the skb must not be queued onto `rx_list`; doing so causes refcount issues on `strp->anchor` and `strp->anchor->frag_list`, which lead to a use-after-free (UAF).

# Exploit

Using this vulnerability we can add `strp->anchor` to the `rx_list` queue. If we then call recvmsg on the TLS socket, the kernel calls `process_rx_list`, which processes the `strp->anchor` skb. Let's see how `strp->anchor` is used when TCP data arrives and after TLS decryption in `tls_sw_recvmsg`.

```c
static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
{
struct tcp_sock *tp = tcp_sk(strp->sk);
struct sk_buff *first;
u32 offset;

first = tcp_recv_skb(strp->sk, tp->copied_seq, &offset);
if (WARN_ON_ONCE(!first))
return;

/* Bestow the state onto the anchor */
strp->anchor->len = offset + len;
strp->anchor->data_len = offset + len;
strp->anchor->truesize = offset + len;

skb_shinfo(strp->anchor)->frag_list = first;

skb_copy_header(strp->anchor, first);
strp->anchor->destructor = NULL;

strp->stm.offset = offset;
}
```

When TCP receives data, `tls_strp_load_anchor_with_queue` is called from `tls_strp_read_sock`. The socket buffer coming from TCP is stored in `skb_shinfo(strp->anchor)->frag_list`, so the TLS decryption code can find the skb it needs to process through `strp->anchor` alone.

In `tls_sw_recvmsg`, after decryption is done, the code does not clear `strp->anchor`:
```c
void tls_strp_msg_done(struct tls_strparser *strp)
{
WARN_ON(!strp->stm.full_len);

if (likely(!strp->copy_mode))
tcp_read_done(strp->sk, strp->stm.full_len);
else
tls_strp_flush_anchor_copy(strp);

WRITE_ONCE(strp->msg_ready, 0);
memset(&strp->stm, 0, sizeof(strp->stm));

tls_strp_check_rcv(strp);
}
```
So `skb_shinfo(strp->anchor)->frag_list` still holds the old pointer, which now points to a freed TCP socket buffer.

Our idea is to access freed pages from skb data via splice syscall. In TCP we can perform zero-copy using splice by just splicing pipe data to the established TCP socket. And it will reach this code.
```c
} else if (zc == MSG_SPLICE_PAGES) {
/* Splice in data if we can; copy if we can't. */
if (tcp_downgrade_zcopy_pure(sk, skb))
goto wait_for_space;
copy = tcp_wmem_schedule(sk, copy);
if (!copy)
goto wait_for_space;

err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
sk->sk_allocation);
if (err < 0) {
if (err == -EMSGSIZE) {
tcp_mark_push(tp, skb);
goto new_segment;
}
goto do_error;
}
copy = err;

if (!(flags & MSG_NO_SHARED_FRAGS))
skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;

sk_wmem_queued_add(sk, copy);
sk_mem_charge(sk, copy);
}
```
So if we splice a pipe into the TCP socket, `msg->msg_iter` comes from the pipe, and `skb_splice_from_iter` simply adds the pipe pages to `skb_shinfo(skb)->frags`. Because we are using the loopback interface, the kernel just clones the skb, and it arrives at the other end of the TCP connection still carrying our spliced pipe page.

We then have a scenario in which `strp->anchor` is on the `rx_list` queue, `skb_shinfo(strp->anchor)->frag_list` points to the freed TCP socket buffer, and that TCP socket buffer in turn contains a freed pipe page.

To exploit this further, we call `splice` on the TLS socket that contains this freed pipe page. This calls `tls_sw_splice_read`, which installs the freed page into our pipe. The freed page has a `page->count` of 0, and `skb_splice_bits` simply takes that page and increments `page->count` even though the page has already been freed.
```c
/*
* Fill page/offset/length into spd, if it can hold more pages.
*/
static bool spd_fill_page(struct splice_pipe_desc *spd,
struct pipe_inode_info *pipe, struct page *page,
unsigned int *len, unsigned int offset,
bool linear,
struct sock *sk)
{
if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
return true;

if (linear) {
page = linear_to_page(page, len, &offset, sk);
if (!page)
return true;
}
if (spd_can_coalesce(spd, page, offset)) {
spd->partial[spd->nr_pages - 1].len += *len;
return false;
}
get_page(page); // install freed page to pipe
spd->pages[spd->nr_pages] = page;
spd->partial[spd->nr_pages].len = *len;
spd->partial[spd->nr_pages].offset = offset;
spd->nr_pages++;

return false;
}
```

So now our pipe holds the freed page that has `page->count` equal to 1. But remember, this page is already freed and it is actually still included in the page freelists. But we still can't write to this page, because this page was installed to the pipe with `nosteal_pipe_buf_ops` and without `PIPE_BUF_FLAG_CAN_MERGE`.

If we do a page spray with order 0 (by writing to other spray pipes), another pipe can also allocate this page, so we have two pipes pointing to the same page that has a refcount of 1. If we release the page from the first pipe (just by `read()`-ing from it), the page will return to the freelist, but the second pipe will still hold a reference to the freed page. Furthermore, the second pipe will hold the freed page with `anon_pipe_buf_ops` and `PIPE_BUF_FLAG_CAN_MERGE` set, so we can write to the freed page.

Next we reclaim this freed page (still held by the second pipe) as a PTE page table. By reading from an anonymous mmaped memory region, the kernel will allocate a PTE page table and install the `empty_zero_page` PTE to the page table. `empty_zero_page` is located in the kernel data. By reading from the pipe that now holds the reclaimed PTE page table, we can read the `empty_zero_page` PTE. We can then calculate the physical address of any kernel target we want to write to. In this case we are targeting `core_pattern`. We calculate the `core_pattern` PTE with RW flags and write it to the page table via a pipe write. Then we just write to the anon mmaped memory because its PTE now points to the `core_pattern` page. After overwriting `core_pattern` we just crash a child process to make the kernel run our program as root to read the flag.

Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from Crypto.Cipher import AES
import struct

def generate_tls_record(plaintext, key, salt, rec_seq_int, content_type=b'\x17'):
    """
    Build a raw TLS 1.2 record protected with AES-CCM-128.

    The record layout is:
        header (5 bytes: type, version, length)
        || explicit nonce (8-byte sequence number)
        || ciphertext
        || 16-byte authentication tag

    Args:
        plaintext (bytes): The data to encrypt.
        key (bytes): The 128-bit (16-byte) encryption key.
        salt (bytes): The 4-byte salt (implicit part of the nonce).
        rec_seq_int (int): The record sequence number as an integer.
        content_type (bytes, optional): The one-byte TLS record content type.
            Defaults to b'\\x17' (Application Data).

    Returns:
        bytes: The raw TLS 1.2 record.

    Raises:
        ValueError: If `salt` is not exactly 4 bytes or `content_type` is not
            exactly 1 byte. Either would otherwise silently produce a
            malformed record (wrong nonce length or a shifted header).
    """
    if len(salt) != 4:
        raise ValueError(f"salt must be exactly 4 bytes, got {len(salt)}")
    if len(content_type) != 1:
        raise ValueError(
            f"content_type must be exactly 1 byte, got {len(content_type)}"
        )

    # The sequence number travels on the wire as an 8-byte big-endian value.
    rec_seq_bytes = rec_seq_int.to_bytes(8, 'big')

    # Protocol version bytes for TLS 1.2.
    tls_version = b'\x03\x03'

    # 12-byte nonce = 4-byte salt + 8-byte explicit part (the sequence number).
    nonce = salt + rec_seq_bytes

    # Additional Authenticated Data:
    # seq_num + TLSCompressed.type + TLSCompressed.version + TLSCompressed.length
    aad = (
        rec_seq_bytes
        + content_type
        + tls_version
        + struct.pack('!H', len(plaintext))
    )

    # AES-CCM with a 16-byte (128-bit) authentication tag; the AAD is
    # authenticated but not encrypted.
    cipher = AES.new(key, AES.MODE_CCM, nonce=nonce, mac_len=16)
    cipher.update(aad)
    ciphertext, tag = cipher.encrypt_and_digest(plaintext)

    # AEAD payload in TLS 1.2: nonce_explicit + ciphertext + tag.
    encrypted_payload = rec_seq_bytes + ciphertext + tag

    # 5-byte record header: Type (1) + Version (2) + Length (2).
    record_header = (
        content_type
        + tls_version
        + struct.pack('!H', len(encrypted_payload))
    )

    return record_header + encrypted_payload

def format_as_c_array(data, var_name="tls_record"):
    """
    Render a bytes object as C source: an `unsigned char` array definition
    followed by an `unsigned int <var_name>_len` holding its size.

    Args:
        data (bytes): The bytes to emit as array elements.
        var_name (str, optional): Name of the generated C array.

    Returns:
        str: The C source text, with at most 12 byte literals per row.
    """
    bytes_per_row = 12
    literals = ["0x{:02x}".format(value) for value in data]

    # Group the literals into rows, then join the rows with the separator
    # that the declaration uses between lines.
    rows = [
        ", ".join(literals[start:start + bytes_per_row])
        for start in range(0, len(literals), bytes_per_row)
    ]
    initializer = ",\n ".join(rows)

    return (
        f"unsigned char {var_name}[] = {{\n "
        + initializer
        + "\n};\n"
        + f"unsigned int {var_name}_len = sizeof({var_name});"
    )

if __name__ == '__main__':
    # Static parameters
    key = b'\x00' * 16
    salt = b'\x00' * 4

    # Human-readable names for the TLS record content types, used only in
    # the generated comments.
    content_type_names = {
        b'\x14': "Change Cipher Spec",
        b'\x15': "Alert",
        b'\x16': "Handshake",
        b'\x17': "Application Data",
    }

    # Plaintexts to send (already as bytes) and the content type of the
    # record that carries each one. The middle record is deliberately a
    # non-data record (0x16) placed between two data records.
    plaintexts = [
        b"Hello world",
        b"Hello world",
        b"Hello world"
    ]
    content_types = [b"\x17", b"\x16", b"\x17"]

    # The record sequence number starts at 0 and must increase by one for
    # every record, so the loop index doubles as the sequence number.
    for current_sequence_number, (plaintext_bytes, content_type) in enumerate(
            zip(plaintexts, content_types)):
        message_number = current_sequence_number + 1

        # Generate the TLS record with this message's own content type.
        tls_record = generate_tls_record(
            plaintext_bytes,
            key,
            salt,
            current_sequence_number,
            content_type
        )

        # Format the output as a unique C array
        c_array_output = format_as_c_array(tls_record, f"tls_record_{message_number}")

        # Computed outside the f-string so the script also parses on Python
        # versions that reject backslashes inside f-string expressions.
        content_type_hex = content_type.hex()
        content_type_name = content_type_names.get(content_type, "Unknown")

        # We decode the plaintext bytes here only for the comment generation
        print(f"/* MESSAGE {message_number}: Raw TLS 1.2 record for plaintext: '{plaintext_bytes.decode()}' */")
        print(f"/* Sequence Number: {current_sequence_number} */")
        print(f"/* Content Type: 0x{content_type_hex} ({content_type_name}) */")
        print(f"/* Total length: {len(tls_record)} bytes */")
        print(c_array_output)
        print("\n" + "="*50 + "\n")

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
- Requirements:
- Capabilities:
- Kernel configuration: CONFIG_TLS
- User namespaces required: No
- Introduced by: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=692d7b5d1f9125a1cf0595e979e3b5fb7210547e
- Fixed by: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=fdfbaec5923d9359698cbb286bc0deadbb717504
- Affected Version: v5.0 - v6.7
- Affected Component: net/tls
- Cause: Use-After-Free
- Syscall to disable:
- URL: https://cve.mitre.org/cgi-bin/cvename.cgi?name=2024-58239
- Description: A use-after-free vulnerability in the Linux kernel's net/tls
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# `all` names no file, so declare it phony to keep it from being shadowed
# by a file called "all".
.PHONY: all
all: exploit

# Build the exploit as a statically linked binary.
exploit: exploit.c
	gcc -static -o exploit exploit.c
Binary file not shown.
Loading
Loading