66//!
77//! Only the async-signal-safe `close_inherited_fds_raw()` is used,
88//! called from the `pre_exec` hook before exec().
9+ //!
10+ //! ## Strategy (in order of preference)
11+ //!
12+ //! 1. **Linux 5.9+**: `close_range(first_fd, ~0U, 0)` — O(1), kernel closes all.
13+ //! 2. **Linux < 5.9**: Enumerate `/proc/self/fd` via raw `getdents64` syscall
14+ //! (no memory allocation) and close only open FDs. Falls back to brute-force
15+ //! with a dynamic upper bound from `getrlimit(RLIMIT_NOFILE)`.
16+ //! 3. **macOS**: Brute-force close with dynamic upper bound from
17+ //! `getrlimit(RLIMIT_NOFILE)` instead of a hardcoded limit.
18+
19+ /// Query the soft limit for `RLIMIT_NOFILE`. Async-signal-safe.
20+ ///
21+ /// Returns the soft limit (current max open files), or a safe default
22+ /// if the query fails. `getrlimit` is async-signal-safe per POSIX.
23+ ///
24+ /// Note: Cannot reuse `rlimit::get_rlimit()` because it returns `io::Error`
25+ /// which heap-allocates (not async-signal-safe for pre_exec context).
26+ fn get_max_fd ( ) -> i32 {
27+ let mut rlim = libc:: rlimit {
28+ rlim_cur : 0 ,
29+ rlim_max : 0 ,
30+ } ;
31+ // SAFETY: getrlimit is async-signal-safe per POSIX. rlim is a valid stack-allocated struct.
32+ let result = unsafe { libc:: getrlimit ( libc:: RLIMIT_NOFILE , & mut rlim) } ;
33+ if result == 0 && rlim. rlim_cur > 0 {
34+ if rlim. rlim_cur < i32:: MAX as u64 {
35+ rlim. rlim_cur as i32
36+ } else {
37+ // rlim_cur is RLIM_INFINITY or very large; cap to a safe maximum.
38+ // 1048576 (2^20) matches Linux's default /proc/sys/fs/nr_open.
39+ 1_048_576
40+ }
41+ } else {
42+ // getrlimit failed or returned 0; safe default per POSIX OPEN_MAX
43+ 1024
44+ }
45+ }
46+
47+ /// Close open FDs by enumerating `/proc/self/fd` with raw `getdents64`.
48+ /// Async-signal-safe: uses only stack buffers and raw syscalls.
49+ ///
50+ /// Returns `true` if `/proc/self/fd` was successfully enumerated,
51+ /// `false` if it's unavailable (e.g., mount namespace without /proc).
52+ #[ cfg( target_os = "linux" ) ]
53+ fn close_fds_via_proc ( first_fd : i32 ) -> bool {
54+ // Open /proc/self/fd with raw syscall (no allocation)
55+ let proc_path = b"/proc/self/fd\0 " ;
56+ // SAFETY: proc_path is a valid null-terminated string literal. open() is async-signal-safe.
57+ let dir_fd = unsafe {
58+ libc:: open (
59+ proc_path. as_ptr ( ) . cast :: < libc:: c_char > ( ) ,
60+ libc:: O_RDONLY | libc:: O_DIRECTORY | libc:: O_CLOEXEC ,
61+ )
62+ } ;
63+ if dir_fd < 0 {
64+ return false ;
65+ }
66+
67+ // Stack buffer for getdents64 — 1024 bytes handles ~40 entries per call,
68+ // sufficient for typical processes without heap allocation.
69+ let mut buf = [ 0u8 ; 1024 ] ;
70+
71+ loop {
72+ let nread = unsafe {
73+ libc:: syscall (
74+ libc:: SYS_getdents64 ,
75+ dir_fd,
76+ buf. as_mut_ptr ( ) ,
77+ buf. len ( ) as libc:: c_uint ,
78+ )
79+ } ;
80+
81+ if nread <= 0 {
82+ break ;
83+ }
84+
85+ let mut offset = 0usize ;
86+ while offset < nread as usize {
87+ // SAFETY: getdents64 returns packed linux_dirent64 structs.
88+ // d_reclen is at byte offset 16. Use read_unaligned because the
89+ // buffer is a byte array and the u16 field may not be 2-byte aligned.
90+ let d_reclen =
91+ unsafe { buf. as_ptr ( ) . add ( offset + 16 ) . cast :: < u16 > ( ) . read_unaligned ( ) } as usize ;
92+
93+ if d_reclen == 0 || offset + d_reclen > nread as usize {
94+ break ;
95+ }
96+
97+ // d_name starts at offset 19 in linux_dirent64
98+ let name_ptr = unsafe { buf. as_ptr ( ) . add ( offset + 19 ) } ;
99+
100+ // Parse FD number from d_name (decimal string, null-terminated).
101+ // Async-signal-safe: no allocation, just pointer arithmetic.
102+ let fd = parse_fd_from_name ( name_ptr) ;
103+
104+ if let Some ( fd) = fd
105+ && fd >= first_fd
106+ && fd != dir_fd
107+ {
108+ // SAFETY: close() is async-signal-safe. Closing an already-closed FD is harmless.
109+ unsafe { libc:: close ( fd) } ;
110+ }
111+
112+ offset += d_reclen;
113+ }
114+ }
115+
116+ // SAFETY: close() is async-signal-safe. dir_fd was opened by us above.
117+ unsafe { libc:: close ( dir_fd) } ;
118+ true
119+ }
120+
121+ /// Parse a decimal FD number from a null-terminated C string pointer.
122+ /// Returns `None` for non-numeric entries (e.g., "." and "..").
123+ /// Async-signal-safe: no allocation, pure arithmetic.
124+ ///
125+ /// # Safety
126+ ///
127+ /// `ptr` must point to a valid, null-terminated byte string in readable memory.
128+ /// Called only from `close_fds_via_proc` with pointers into the `getdents64` buffer,
129+ /// which the kernel guarantees to contain null-terminated `d_name` fields.
130+ #[ cfg( target_os = "linux" ) ]
131+ fn parse_fd_from_name ( ptr : * const u8 ) -> Option < i32 > {
132+ let mut fd: i32 = 0 ;
133+ let mut i = 0usize ;
134+ loop {
135+ // SAFETY: ptr is guaranteed by the caller to point to a null-terminated string.
136+ // We stop reading at the null terminator (byte == 0).
137+ let byte = unsafe { * ptr. add ( i) } ;
138+ if byte == 0 {
139+ break ;
140+ }
141+ if !byte. is_ascii_digit ( ) {
142+ return None ;
143+ }
144+ fd = fd. checked_mul ( 10 ) ?. checked_add ( ( byte - b'0' ) as i32 ) ?;
145+ i += 1 ;
146+ }
147+ if i == 0 {
148+ return None ;
149+ }
150+ Some ( fd)
151+ }
152+
153+ /// Brute-force close all FDs from `first_fd` to `get_max_fd()`. Async-signal-safe.
154+ ///
155+ /// Last-resort fallback when `close_range` and `/proc/self/fd` are unavailable.
156+ fn brute_force_close_fds ( first_fd : i32 ) {
157+ let max_fd = get_max_fd ( ) ;
158+ for fd in first_fd..max_fd {
159+ // SAFETY: close() is async-signal-safe. Closing a non-open FD returns EBADF (ignored).
160+ unsafe { libc:: close ( fd) } ;
161+ }
162+ }
9163
10164/// Close all FDs from `first_fd` onwards. Async-signal-safe.
11165///
15169///
16170/// # Safety
17171///
18- /// This function only uses async-signal-safe syscalls (close, syscall).
172+ /// This function only uses async-signal-safe syscalls (close, syscall,
173+ /// getrlimit, open, getdents64).
19174/// Do NOT add:
20175/// - Logging (tracing, println)
21176/// - Memory allocation (Box, Vec, String)
29184pub fn close_fds_from ( first_fd : i32 ) -> Result < ( ) , i32 > {
30185 #[ cfg( target_os = "linux" ) ]
31186 {
32- // Try close_range syscall (Linux 5.9+, most efficient)
187+ // Strategy 1: close_range syscall (Linux 5.9+, most efficient — O(1) )
33188 let result = unsafe {
34189 libc:: syscall (
35190 libc:: SYS_close_range ,
@@ -42,25 +197,24 @@ pub fn close_fds_from(first_fd: i32) -> Result<(), i32> {
42197 return Ok ( ( ) ) ;
43198 }
44199
45- // Fallback: brute force close
46- // Note: We can't use /proc/self/fd here because:
47- // 1. read_dir allocates memory (not async-signal-safe)
48- // 2. We might be in a mount namespace where /proc isn't mounted
49- for fd in first_fd..1024 {
50- // Ignore errors - FD might not be open
51- unsafe { libc:: close ( fd) } ;
200+ // Strategy 2: Enumerate /proc/self/fd via raw getdents64 (no allocation).
201+ // Closes only open FDs — efficient even with high ulimit -n.
202+ if close_fds_via_proc ( first_fd) {
203+ return Ok ( ( ) ) ;
52204 }
205+
206+ // Strategy 3: Brute-force close with dynamic limit from getrlimit.
207+ // Used when both close_range and /proc are unavailable (e.g., old kernel
208+ // in a mount namespace without /proc).
209+ brute_force_close_fds ( first_fd) ;
53210 Ok ( ( ) )
54211 }
55212
56213 #[ cfg( target_os = "macos" ) ]
57214 {
58- // macOS: brute force close (no close_range syscall)
59- // 4096 is a reasonable upper bound for most processes
60- for fd in first_fd..4096 {
61- // Ignore errors - FD might not be open
62- unsafe { libc:: close ( fd) } ;
63- }
215+ // macOS: brute-force close with dynamic limit from getrlimit.
216+ // Handles systems where ulimit -n exceeds the previous hardcoded 4096.
217+ brute_force_close_fds ( first_fd) ;
64218 Ok ( ( ) )
65219 }
66220
@@ -230,4 +384,92 @@ mod tests {
230384 child_close_fds_from_closes_target_and_above,
231385 ) ;
232386 }
387+
388+ /// Create a high-numbered FD (above the old hardcoded 1024/4096 limits)
389+ /// and verify that `close_inherited_fds_raw` closes it.
390+ fn child_close_high_numbered_fd ( ) -> i32 {
391+ // Raise the soft FD limit so we can dup2 to a high FD number.
392+ // Some systems default to ulimit -n 1024, which would prevent dup2 to FD 2000.
393+ let new_limit = libc:: rlimit {
394+ rlim_cur : 4096 ,
395+ rlim_max : 4096 ,
396+ } ;
397+ // SAFETY: setrlimit is async-signal-safe. Raising soft limit within hard limit is allowed.
398+ if unsafe { libc:: setrlimit ( libc:: RLIMIT_NOFILE , & new_limit) } != 0 {
399+ // Can't raise limit (hard limit too low) — skip test gracefully
400+ return 0 ;
401+ }
402+
403+ // dup2 stdout to a high FD number that exceeds the old hardcoded limits.
404+ // Use 2000 on all platforms — above old Linux limit (1024) and reasonable.
405+ let high_fd: i32 = 2000 ;
406+ let result = unsafe { libc:: dup2 ( STDOUT_FD , high_fd) } ;
407+ if result != high_fd {
408+ // dup2 failed — unexpected after raising ulimit
409+ return 1 ;
410+ }
411+
412+ // Verify the high FD is open
413+ if unsafe { libc:: fcntl ( high_fd, libc:: F_GETFD ) } < 0 {
414+ return 2 ;
415+ }
416+
417+ // Close all inherited FDs
418+ if close_inherited_fds_raw ( ) . is_err ( ) {
419+ return 3 ;
420+ }
421+
422+ // The high FD should now be closed
423+ if unsafe { libc:: fcntl ( high_fd, libc:: F_GETFD ) } != -1 {
424+ return 4 ;
425+ }
426+ 0
427+ }
428+
429+ #[ test]
430+ fn test_close_high_numbered_fd ( ) {
431+ run_in_child ( "test_close_high_numbered_fd" , child_close_high_numbered_fd) ;
432+ }
433+
434+ #[ test]
435+ fn test_get_max_fd_returns_positive ( ) {
436+ let max = get_max_fd ( ) ;
437+ assert ! (
438+ max > 0 ,
439+ "get_max_fd should return positive value, got {}" ,
440+ max
441+ ) ;
442+ assert ! (
443+ max >= 256 ,
444+ "get_max_fd should return at least 256, got {}" ,
445+ max
446+ ) ;
447+ }
448+
449+ #[ cfg( target_os = "linux" ) ]
450+ #[ test]
451+ fn test_parse_fd_from_name ( ) {
452+ // Valid numeric names
453+ assert_eq ! ( parse_fd_from_name( b"0\0 " . as_ptr( ) ) , Some ( 0 ) ) ;
454+ assert_eq ! ( parse_fd_from_name( b"3\0 " . as_ptr( ) ) , Some ( 3 ) ) ;
455+ assert_eq ! ( parse_fd_from_name( b"42\0 " . as_ptr( ) ) , Some ( 42 ) ) ;
456+ assert_eq ! ( parse_fd_from_name( b"1024\0 " . as_ptr( ) ) , Some ( 1024 ) ) ;
457+ assert_eq ! ( parse_fd_from_name( b"65535\0 " . as_ptr( ) ) , Some ( 65535 ) ) ;
458+
459+ // Non-numeric names (. and ..)
460+ assert_eq ! ( parse_fd_from_name( b".\0 " . as_ptr( ) ) , None ) ;
461+ assert_eq ! ( parse_fd_from_name( b"..\0 " . as_ptr( ) ) , None ) ;
462+
463+ // Empty name
464+ assert_eq ! ( parse_fd_from_name( b"\0 " . as_ptr( ) ) , None ) ;
465+
466+ // Overflow: i32::MAX (2147483647) should succeed
467+ assert_eq ! ( parse_fd_from_name( b"2147483647\0 " . as_ptr( ) ) , Some ( i32 :: MAX ) ) ;
468+
469+ // Overflow: i32::MAX + 1 (2147483648) should return None (checked_add overflow)
470+ assert_eq ! ( parse_fd_from_name( b"2147483648\0 " . as_ptr( ) ) , None ) ;
471+
472+ // Overflow: very large number should return None
473+ assert_eq ! ( parse_fd_from_name( b"99999999999\0 " . as_ptr( ) ) , None ) ;
474+ }
233475}
0 commit comments