Skip to content

Commit 5bf706b

Browse files
lileclaude
andcommitted
fix(jailer): replace hardcoded FD limits with dynamic closure strategies
The FD cleanup in pre_exec used hardcoded upper bounds (1024 on Linux, 4096 on macOS). On systems with raised ulimit -n, FDs above these limits leaked into jailed processes, potentially exposing credentials, database connections, or network sockets. Replace with a 3-strategy cascade (Linux): 1. close_range(first_fd, ~0U, 0) — O(1), Linux 5.9+ 2. /proc/self/fd enumeration via raw getdents64 — no heap allocation 3. Brute-force close with dynamic limit from getrlimit(RLIMIT_NOFILE) macOS uses brute-force with dynamic getrlimit limit (replaces hardcoded 4096). All operations remain async-signal-safe for the pre_exec context. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent da71624 commit 5bf706b

File tree

1 file changed

+257
-15
lines changed
  • src/boxlite/src/jailer/common

1 file changed

+257
-15
lines changed

src/boxlite/src/jailer/common/fd.rs

Lines changed: 257 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,160 @@
66
//!
77
//! Only the async-signal-safe `close_inherited_fds_raw()` is used,
88
//! called from the `pre_exec` hook before exec().
9+
//!
10+
//! ## Strategy (in order of preference)
11+
//!
12+
//! 1. **Linux 5.9+**: `close_range(first_fd, ~0U, 0)` — O(1), kernel closes all.
13+
//! 2. **Linux < 5.9**: Enumerate `/proc/self/fd` via raw `getdents64` syscall
14+
//! (no memory allocation) and close only open FDs. Falls back to brute-force
15+
//! with a dynamic upper bound from `getrlimit(RLIMIT_NOFILE)`.
16+
//! 3. **macOS**: Brute-force close with dynamic upper bound from
17+
//! `getrlimit(RLIMIT_NOFILE)` instead of a hardcoded limit.
18+
19+
/// Query the soft limit for `RLIMIT_NOFILE`. Async-signal-safe.
20+
///
21+
/// Returns the soft limit (current max open files), or a safe default
22+
/// if the query fails. `getrlimit` is async-signal-safe per POSIX.
23+
///
24+
/// Note: Cannot reuse `rlimit::get_rlimit()` because it returns `io::Error`
25+
/// which heap-allocates (not async-signal-safe for pre_exec context).
26+
fn get_max_fd() -> i32 {
27+
let mut rlim = libc::rlimit {
28+
rlim_cur: 0,
29+
rlim_max: 0,
30+
};
31+
// SAFETY: getrlimit is async-signal-safe per POSIX. rlim is a valid stack-allocated struct.
32+
let result = unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlim) };
33+
if result == 0 && rlim.rlim_cur > 0 {
34+
if rlim.rlim_cur < i32::MAX as u64 {
35+
rlim.rlim_cur as i32
36+
} else {
37+
// rlim_cur is RLIM_INFINITY or very large; cap to a safe maximum.
38+
// 1048576 (2^20) matches Linux's default /proc/sys/fs/nr_open.
39+
1_048_576
40+
}
41+
} else {
42+
// getrlimit failed or returned 0; safe default per POSIX OPEN_MAX
43+
1024
44+
}
45+
}
46+
47+
/// Close open FDs by enumerating `/proc/self/fd` with raw `getdents64`.
48+
/// Async-signal-safe: uses only stack buffers and raw syscalls.
49+
///
50+
/// Returns `true` if `/proc/self/fd` was successfully enumerated,
51+
/// `false` if it's unavailable (e.g., mount namespace without /proc).
52+
#[cfg(target_os = "linux")]
53+
fn close_fds_via_proc(first_fd: i32) -> bool {
54+
// Open /proc/self/fd with raw syscall (no allocation)
55+
let proc_path = b"/proc/self/fd\0";
56+
// SAFETY: proc_path is a valid null-terminated string literal. open() is async-signal-safe.
57+
let dir_fd = unsafe {
58+
libc::open(
59+
proc_path.as_ptr().cast::<libc::c_char>(),
60+
libc::O_RDONLY | libc::O_DIRECTORY | libc::O_CLOEXEC,
61+
)
62+
};
63+
if dir_fd < 0 {
64+
return false;
65+
}
66+
67+
// Stack buffer for getdents64 — 1024 bytes handles ~40 entries per call,
68+
// sufficient for typical processes without heap allocation.
69+
let mut buf = [0u8; 1024];
70+
71+
loop {
72+
let nread = unsafe {
73+
libc::syscall(
74+
libc::SYS_getdents64,
75+
dir_fd,
76+
buf.as_mut_ptr(),
77+
buf.len() as libc::c_uint,
78+
)
79+
};
80+
81+
if nread <= 0 {
82+
break;
83+
}
84+
85+
let mut offset = 0usize;
86+
while offset < nread as usize {
87+
// SAFETY: getdents64 returns packed linux_dirent64 structs.
88+
// d_reclen is at byte offset 16. Use read_unaligned because the
89+
// buffer is a byte array and the u16 field may not be 2-byte aligned.
90+
let d_reclen =
91+
unsafe { buf.as_ptr().add(offset + 16).cast::<u16>().read_unaligned() } as usize;
92+
93+
if d_reclen == 0 || offset + d_reclen > nread as usize {
94+
break;
95+
}
96+
97+
// d_name starts at offset 19 in linux_dirent64
98+
let name_ptr = unsafe { buf.as_ptr().add(offset + 19) };
99+
100+
// Parse FD number from d_name (decimal string, null-terminated).
101+
// Async-signal-safe: no allocation, just pointer arithmetic.
102+
let fd = parse_fd_from_name(name_ptr);
103+
104+
if let Some(fd) = fd
105+
&& fd >= first_fd
106+
&& fd != dir_fd
107+
{
108+
// SAFETY: close() is async-signal-safe. Closing an already-closed FD is harmless.
109+
unsafe { libc::close(fd) };
110+
}
111+
112+
offset += d_reclen;
113+
}
114+
}
115+
116+
// SAFETY: close() is async-signal-safe. dir_fd was opened by us above.
117+
unsafe { libc::close(dir_fd) };
118+
true
119+
}
120+
121+
/// Parse a decimal FD number from a null-terminated C string pointer.
122+
/// Returns `None` for non-numeric entries (e.g., "." and "..").
123+
/// Async-signal-safe: no allocation, pure arithmetic.
124+
///
125+
/// # Safety
126+
///
127+
/// `ptr` must point to a valid, null-terminated byte string in readable memory.
128+
/// Called only from `close_fds_via_proc` with pointers into the `getdents64` buffer,
129+
/// which the kernel guarantees to contain null-terminated `d_name` fields.
130+
#[cfg(target_os = "linux")]
131+
fn parse_fd_from_name(ptr: *const u8) -> Option<i32> {
132+
let mut fd: i32 = 0;
133+
let mut i = 0usize;
134+
loop {
135+
// SAFETY: ptr is guaranteed by the caller to point to a null-terminated string.
136+
// We stop reading at the null terminator (byte == 0).
137+
let byte = unsafe { *ptr.add(i) };
138+
if byte == 0 {
139+
break;
140+
}
141+
if !byte.is_ascii_digit() {
142+
return None;
143+
}
144+
fd = fd.checked_mul(10)?.checked_add((byte - b'0') as i32)?;
145+
i += 1;
146+
}
147+
if i == 0 {
148+
return None;
149+
}
150+
Some(fd)
151+
}
152+
153+
/// Brute-force close all FDs from `first_fd` to `get_max_fd()`. Async-signal-safe.
154+
///
155+
/// Last-resort fallback when `close_range` and `/proc/self/fd` are unavailable.
156+
fn brute_force_close_fds(first_fd: i32) {
157+
let max_fd = get_max_fd();
158+
for fd in first_fd..max_fd {
159+
// SAFETY: close() is async-signal-safe. Closing a non-open FD returns EBADF (ignored).
160+
unsafe { libc::close(fd) };
161+
}
162+
}
9163

10164
/// Close all FDs from `first_fd` onwards. Async-signal-safe.
11165
///
@@ -15,7 +169,8 @@
15169
///
16170
/// # Safety
17171
///
18-
/// This function only uses async-signal-safe syscalls (close, syscall).
172+
/// This function only uses async-signal-safe syscalls (close, syscall,
173+
/// getrlimit, open, getdents64).
19174
/// Do NOT add:
20175
/// - Logging (tracing, println)
21176
/// - Memory allocation (Box, Vec, String)
@@ -29,7 +184,7 @@
29184
pub fn close_fds_from(first_fd: i32) -> Result<(), i32> {
30185
#[cfg(target_os = "linux")]
31186
{
32-
// Try close_range syscall (Linux 5.9+, most efficient)
187+
// Strategy 1: close_range syscall (Linux 5.9+, most efficient — O(1))
33188
let result = unsafe {
34189
libc::syscall(
35190
libc::SYS_close_range,
@@ -42,25 +197,24 @@ pub fn close_fds_from(first_fd: i32) -> Result<(), i32> {
42197
return Ok(());
43198
}
44199

45-
// Fallback: brute force close
46-
// Note: We can't use /proc/self/fd here because:
47-
// 1. read_dir allocates memory (not async-signal-safe)
48-
// 2. We might be in a mount namespace where /proc isn't mounted
49-
for fd in first_fd..1024 {
50-
// Ignore errors - FD might not be open
51-
unsafe { libc::close(fd) };
200+
// Strategy 2: Enumerate /proc/self/fd via raw getdents64 (no allocation).
201+
// Closes only open FDs — efficient even with high ulimit -n.
202+
if close_fds_via_proc(first_fd) {
203+
return Ok(());
52204
}
205+
206+
// Strategy 3: Brute-force close with dynamic limit from getrlimit.
207+
// Used when both close_range and /proc are unavailable (e.g., old kernel
208+
// in a mount namespace without /proc).
209+
brute_force_close_fds(first_fd);
53210
Ok(())
54211
}
55212

56213
#[cfg(target_os = "macos")]
57214
{
58-
// macOS: brute force close (no close_range syscall)
59-
// 4096 is a reasonable upper bound for most processes
60-
for fd in first_fd..4096 {
61-
// Ignore errors - FD might not be open
62-
unsafe { libc::close(fd) };
63-
}
215+
// macOS: brute-force close with dynamic limit from getrlimit.
216+
// Handles systems where ulimit -n exceeds the previous hardcoded 4096.
217+
brute_force_close_fds(first_fd);
64218
Ok(())
65219
}
66220

@@ -230,4 +384,92 @@ mod tests {
230384
child_close_fds_from_closes_target_and_above,
231385
);
232386
}
387+
388+
/// Create a high-numbered FD (above the old hardcoded 1024/4096 limits)
389+
/// and verify that `close_inherited_fds_raw` closes it.
390+
fn child_close_high_numbered_fd() -> i32 {
391+
// Raise the soft FD limit so we can dup2 to a high FD number.
392+
// Some systems default to ulimit -n 1024, which would prevent dup2 to FD 2000.
393+
let new_limit = libc::rlimit {
394+
rlim_cur: 4096,
395+
rlim_max: 4096,
396+
};
397+
// SAFETY: setrlimit is async-signal-safe. Raising soft limit within hard limit is allowed.
398+
if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &new_limit) } != 0 {
399+
// Can't raise limit (hard limit too low) — skip test gracefully
400+
return 0;
401+
}
402+
403+
// dup2 stdout to a high FD number that exceeds the old hardcoded limits.
404+
// Use 2000 on all platforms — above old Linux limit (1024) and reasonable.
405+
let high_fd: i32 = 2000;
406+
let result = unsafe { libc::dup2(STDOUT_FD, high_fd) };
407+
if result != high_fd {
408+
// dup2 failed — unexpected after raising ulimit
409+
return 1;
410+
}
411+
412+
// Verify the high FD is open
413+
if unsafe { libc::fcntl(high_fd, libc::F_GETFD) } < 0 {
414+
return 2;
415+
}
416+
417+
// Close all inherited FDs
418+
if close_inherited_fds_raw().is_err() {
419+
return 3;
420+
}
421+
422+
// The high FD should now be closed
423+
if unsafe { libc::fcntl(high_fd, libc::F_GETFD) } != -1 {
424+
return 4;
425+
}
426+
0
427+
}
428+
429+
#[test]
430+
fn test_close_high_numbered_fd() {
431+
run_in_child("test_close_high_numbered_fd", child_close_high_numbered_fd);
432+
}
433+
434+
#[test]
435+
fn test_get_max_fd_returns_positive() {
436+
let max = get_max_fd();
437+
assert!(
438+
max > 0,
439+
"get_max_fd should return positive value, got {}",
440+
max
441+
);
442+
assert!(
443+
max >= 256,
444+
"get_max_fd should return at least 256, got {}",
445+
max
446+
);
447+
}
448+
449+
#[cfg(target_os = "linux")]
450+
#[test]
451+
fn test_parse_fd_from_name() {
452+
// Valid numeric names
453+
assert_eq!(parse_fd_from_name(b"0\0".as_ptr()), Some(0));
454+
assert_eq!(parse_fd_from_name(b"3\0".as_ptr()), Some(3));
455+
assert_eq!(parse_fd_from_name(b"42\0".as_ptr()), Some(42));
456+
assert_eq!(parse_fd_from_name(b"1024\0".as_ptr()), Some(1024));
457+
assert_eq!(parse_fd_from_name(b"65535\0".as_ptr()), Some(65535));
458+
459+
// Non-numeric names (. and ..)
460+
assert_eq!(parse_fd_from_name(b".\0".as_ptr()), None);
461+
assert_eq!(parse_fd_from_name(b"..\0".as_ptr()), None);
462+
463+
// Empty name
464+
assert_eq!(parse_fd_from_name(b"\0".as_ptr()), None);
465+
466+
// Overflow: i32::MAX (2147483647) should succeed
467+
assert_eq!(parse_fd_from_name(b"2147483647\0".as_ptr()), Some(i32::MAX));
468+
469+
// Overflow: i32::MAX + 1 (2147483648) should return None (checked_add overflow)
470+
assert_eq!(parse_fd_from_name(b"2147483648\0".as_ptr()), None);
471+
472+
// Overflow: very large number should return None
473+
assert_eq!(parse_fd_from_name(b"99999999999\0".as_ptr()), None);
474+
}
233475
}

0 commit comments

Comments
 (0)