Skip to content

Commit 3dfc377

Browse files
committed
move sendfile/splice/copy_file_range into kernel_copy module
1 parent 888b103 commit 3dfc377

File tree

3 files changed

+150
-153
lines changed

3 files changed

+150
-153
lines changed

library/std/src/sys/unix/fs.rs

+2-149
Original file line numberDiff line numberDiff line change
@@ -1195,6 +1195,8 @@ pub fn copy(from: &Path, to: &Path) -> io::Result<u64> {
11951195
let max_len = u64::MAX;
11961196
let (mut writer, _) = open_to_and_set_permissions(to, reader_metadata)?;
11971197

1198+
use super::kernel_copy::{copy_regular_files, CopyResult};
1199+
11981200
match copy_regular_files(reader.as_raw_fd(), writer.as_raw_fd(), max_len) {
11991201
CopyResult::Ended(result) => result,
12001202
CopyResult::Fallback(written) => {
@@ -1206,155 +1208,6 @@ pub fn copy(from: &Path, to: &Path) -> io::Result<u64> {
12061208
}
12071209
}
12081210

1209-
/// linux-specific implementation that will attempt to use copy_file_range for copy offloading
1210-
/// as the name says, it only works on regular files
1211-
///
1212-
/// Callers must handle fallback to a generic copy loop.
1213-
/// `Fallback` may indicate non-zero number of bytes already written
1214-
/// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`).
1215-
/// If the initial file offset was 0 then `Fallback` will only contain `0`.
1216-
#[cfg(any(target_os = "linux", target_os = "android"))]
1217-
pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
1218-
use crate::cmp;
1219-
use crate::sync::atomic::{AtomicBool, Ordering};
1220-
1221-
// Kernels prior to 4.5 don't have copy_file_range
1222-
// We store the availability in a global to avoid unnecessary syscalls
1223-
static HAS_COPY_FILE_RANGE: AtomicBool = AtomicBool::new(true);
1224-
1225-
unsafe fn copy_file_range(
1226-
fd_in: libc::c_int,
1227-
off_in: *mut libc::loff_t,
1228-
fd_out: libc::c_int,
1229-
off_out: *mut libc::loff_t,
1230-
len: libc::size_t,
1231-
flags: libc::c_uint,
1232-
) -> libc::c_long {
1233-
libc::syscall(libc::SYS_copy_file_range, fd_in, off_in, fd_out, off_out, len, flags)
1234-
}
1235-
1236-
let has_copy_file_range = HAS_COPY_FILE_RANGE.load(Ordering::Relaxed);
1237-
let mut written = 0u64;
1238-
while written < max_len {
1239-
let copy_result = if has_copy_file_range {
1240-
let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
1241-
// cap to 2GB chunks in case u64::MAX is passed in as file size and the file has a non-zero offset
1242-
// this allows us to copy large chunks without hitting the limit,
1243-
// unless someone sets a file offset close to u64::MAX - 2GB, in which case the fallback would kick in
1244-
let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x8000_0000usize);
1245-
let copy_result = unsafe {
1246-
// We actually don't have to adjust the offsets,
1247-
// because copy_file_range adjusts the file offset automatically
1248-
cvt(copy_file_range(
1249-
reader,
1250-
ptr::null_mut(),
1251-
writer,
1252-
ptr::null_mut(),
1253-
bytes_to_copy,
1254-
0,
1255-
))
1256-
};
1257-
if let Err(ref copy_err) = copy_result {
1258-
match copy_err.raw_os_error() {
1259-
Some(libc::ENOSYS | libc::EPERM | libc::EOPNOTSUPP) => {
1260-
HAS_COPY_FILE_RANGE.store(false, Ordering::Relaxed);
1261-
}
1262-
_ => {}
1263-
}
1264-
}
1265-
copy_result
1266-
} else {
1267-
Err(io::Error::from_raw_os_error(libc::ENOSYS))
1268-
};
1269-
match copy_result {
1270-
Ok(0) if written == 0 => {
1271-
// fallback to work around several kernel bugs where copy_file_range will fail to
1272-
// copy any bytes and return 0 instead of an error if
1273-
// - reading virtual files from the proc filesystem which appear to have 0 size
1274-
// but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
1275-
// - copying from an overlay filesystem in docker. reported to occur on fedora 32.
1276-
return CopyResult::Fallback(0);
1277-
}
1278-
Ok(0) => return CopyResult::Ended(Ok(written)), // reached EOF
1279-
Ok(ret) => written += ret as u64,
1280-
Err(err) => {
1281-
match err.raw_os_error() {
1282-
// when file offset + max_length > u64::MAX
1283-
Some(libc::EOVERFLOW) => return CopyResult::Fallback(written),
1284-
Some(
1285-
libc::ENOSYS | libc::EXDEV | libc::EINVAL | libc::EPERM | libc::EOPNOTSUPP,
1286-
) => {
1287-
// Try fallback io::copy if either:
1288-
// - Kernel version is < 4.5 (ENOSYS)
1289-
// - Files are mounted on different fs (EXDEV)
1290-
// - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
1291-
// - copy_file_range is disallowed, for example by seccomp (EPERM)
1292-
// - copy_file_range cannot be used with pipes or device nodes (EINVAL)
1293-
assert_eq!(written, 0);
1294-
return CopyResult::Fallback(0);
1295-
}
1296-
_ => return CopyResult::Ended(Err(err)),
1297-
}
1298-
}
1299-
}
1300-
}
1301-
CopyResult::Ended(Ok(written))
1302-
}
1303-
1304-
#[derive(PartialEq)]
1305-
pub(super) enum SpliceMode {
1306-
Sendfile,
1307-
Splice,
1308-
}
1309-
1310-
pub(super) enum CopyResult {
1311-
Ended(io::Result<u64>),
1312-
Fallback(u64),
1313-
}
1314-
1315-
/// performs splice or sendfile between file descriptors
1316-
/// Does _not_ fall back to a generic copy loop.
1317-
#[cfg(any(target_os = "linux", target_os = "android"))]
1318-
pub(super) fn sendfile_splice(
1319-
mode: SpliceMode,
1320-
reader: RawFd,
1321-
writer: RawFd,
1322-
len: u64,
1323-
) -> CopyResult {
1324-
let mut written = 0u64;
1325-
while written < len {
1326-
let chunk_size = crate::cmp::min(len - written, 0x7ffff000_u64) as usize;
1327-
1328-
let result = match mode {
1329-
SpliceMode::Sendfile => {
1330-
cvt(unsafe { libc::sendfile(writer, reader, ptr::null_mut(), chunk_size) })
1331-
}
1332-
SpliceMode::Splice => cvt(unsafe {
1333-
libc::splice(reader, ptr::null_mut(), writer, ptr::null_mut(), chunk_size, 0)
1334-
}),
1335-
};
1336-
1337-
match result {
1338-
Ok(0) => break, // EOF
1339-
Ok(ret) => written += ret as u64,
1340-
Err(err) => {
1341-
match err.raw_os_error() {
1342-
Some(os_err) if os_err == libc::EINVAL => {
1343-
// splice/sendfile do not support this particular file descriptor (EINVAL)
1344-
assert_eq!(written, 0);
1345-
return CopyResult::Fallback(0);
1346-
}
1347-
Some(os_err) if mode == SpliceMode::Sendfile && os_err == libc::EOVERFLOW => {
1348-
return CopyResult::Fallback(written);
1349-
}
1350-
_ => return CopyResult::Ended(Err(err)),
1351-
}
1352-
}
1353-
}
1354-
}
1355-
CopyResult::Ended(Ok(written))
1356-
}
1357-
13581211
#[cfg(any(target_os = "macos", target_os = "ios"))]
13591212
pub fn copy(from: &Path, to: &Path) -> io::Result<u64> {
13601213
use crate::sync::atomic::{AtomicBool, Ordering};

library/std/src/sys/unix/kernel_copy.rs

+146-2
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,16 @@ use crate::convert::TryInto;
4949
use crate::fs::{File, Metadata};
5050
use crate::io::copy::generic_copy;
5151
use crate::io::{
52-
BufRead, BufReader, BufWriter, Read, Result, StderrLock, StdinLock, StdoutLock, Take, Write,
52+
BufRead, BufReader, BufWriter, Error, Read, Result, StderrLock, StdinLock, StdoutLock, Take,
53+
Write,
5354
};
5455
use crate::mem::ManuallyDrop;
5556
use crate::net::TcpStream;
5657
use crate::os::unix::fs::FileTypeExt;
5758
use crate::os::unix::io::{AsRawFd, FromRawFd, RawFd};
5859
use crate::process::{ChildStderr, ChildStdin, ChildStdout};
59-
use crate::sys::fs::{copy_regular_files, sendfile_splice, CopyResult, SpliceMode};
60+
use crate::ptr;
61+
use crate::sys::cvt;
6062

6163
#[cfg(test)]
6264
mod tests;
@@ -423,3 +425,145 @@ fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta {
423425
Err(_) => FdMeta::NoneObtained,
424426
}
425427
}
428+
429+
pub(super) enum CopyResult {
430+
Ended(Result<u64>),
431+
Fallback(u64),
432+
}
433+
434+
/// linux-specific implementation that will attempt to use copy_file_range for copy offloading
435+
/// as the name says, it only works on regular files
436+
///
437+
/// Callers must handle fallback to a generic copy loop.
438+
/// `Fallback` may indicate non-zero number of bytes already written
439+
/// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`).
440+
/// If the initial file offset was 0 then `Fallback` will only contain `0`.
441+
pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
442+
use crate::cmp;
443+
use crate::sync::atomic::{AtomicBool, Ordering};
444+
445+
// Kernels prior to 4.5 don't have copy_file_range
446+
// We store the availability in a global to avoid unnecessary syscalls
447+
static HAS_COPY_FILE_RANGE: AtomicBool = AtomicBool::new(true);
448+
449+
unsafe fn copy_file_range(
450+
fd_in: libc::c_int,
451+
off_in: *mut libc::loff_t,
452+
fd_out: libc::c_int,
453+
off_out: *mut libc::loff_t,
454+
len: libc::size_t,
455+
flags: libc::c_uint,
456+
) -> libc::c_long {
457+
libc::syscall(libc::SYS_copy_file_range, fd_in, off_in, fd_out, off_out, len, flags)
458+
}
459+
460+
let has_copy_file_range = HAS_COPY_FILE_RANGE.load(Ordering::Relaxed);
461+
let mut written = 0u64;
462+
while written < max_len {
463+
let copy_result = if has_copy_file_range {
464+
let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
465+
// cap to 2GB chunks in case u64::MAX is passed in as file size and the file has a non-zero offset
466+
// this allows us to copy large chunks without hitting the limit,
467+
// unless someone sets a file offset close to u64::MAX - 2GB, in which case the fallback would kick in
468+
let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x8000_0000usize);
469+
let copy_result = unsafe {
470+
// We actually don't have to adjust the offsets,
471+
// because copy_file_range adjusts the file offset automatically
472+
cvt(copy_file_range(
473+
reader,
474+
ptr::null_mut(),
475+
writer,
476+
ptr::null_mut(),
477+
bytes_to_copy,
478+
0,
479+
))
480+
};
481+
if let Err(ref copy_err) = copy_result {
482+
match copy_err.raw_os_error() {
483+
Some(libc::ENOSYS | libc::EPERM | libc::EOPNOTSUPP) => {
484+
HAS_COPY_FILE_RANGE.store(false, Ordering::Relaxed);
485+
}
486+
_ => {}
487+
}
488+
}
489+
copy_result
490+
} else {
491+
Err(Error::from_raw_os_error(libc::ENOSYS))
492+
};
493+
match copy_result {
494+
Ok(0) if written == 0 => {
495+
// fallback to work around several kernel bugs where copy_file_range will fail to
496+
// copy any bytes and return 0 instead of an error if
497+
// - reading virtual files from the proc filesystem which appear to have 0 size
498+
// but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
499+
// - copying from an overlay filesystem in docker. reported to occur on fedora 32.
500+
return CopyResult::Fallback(0);
501+
}
502+
Ok(0) => return CopyResult::Ended(Ok(written)), // reached EOF
503+
Ok(ret) => written += ret as u64,
504+
Err(err) => {
505+
return match err.raw_os_error() {
506+
// when file offset + max_length > u64::MAX
507+
Some(libc::EOVERFLOW) => CopyResult::Fallback(written),
508+
Some(
509+
libc::ENOSYS | libc::EXDEV | libc::EINVAL | libc::EPERM | libc::EOPNOTSUPP,
510+
) => {
511+
// Try fallback io::copy if either:
512+
// - Kernel version is < 4.5 (ENOSYS)
513+
// - Files are mounted on different fs (EXDEV)
514+
// - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
515+
// - copy_file_range is disallowed, for example by seccomp (EPERM)
516+
// - copy_file_range cannot be used with pipes or device nodes (EINVAL)
517+
assert_eq!(written, 0);
518+
CopyResult::Fallback(0)
519+
}
520+
_ => CopyResult::Ended(Err(err)),
521+
};
522+
}
523+
}
524+
}
525+
CopyResult::Ended(Ok(written))
526+
}
527+
528+
#[derive(PartialEq)]
529+
enum SpliceMode {
530+
Sendfile,
531+
Splice,
532+
}
533+
534+
/// performs splice or sendfile between file descriptors
535+
/// Does _not_ fall back to a generic copy loop.
536+
fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult {
537+
let mut written = 0u64;
538+
while written < len {
539+
let chunk_size = crate::cmp::min(len - written, 0x7ffff000_u64) as usize;
540+
541+
let result = match mode {
542+
SpliceMode::Sendfile => {
543+
cvt(unsafe { libc::sendfile(writer, reader, ptr::null_mut(), chunk_size) })
544+
}
545+
SpliceMode::Splice => cvt(unsafe {
546+
libc::splice(reader, ptr::null_mut(), writer, ptr::null_mut(), chunk_size, 0)
547+
}),
548+
};
549+
550+
match result {
551+
Ok(0) => break, // EOF
552+
Ok(ret) => written += ret as u64,
553+
Err(err) => {
554+
return match err.raw_os_error() {
555+
Some(os_err) if os_err == libc::EINVAL => {
556+
// splice/sendfile do not support this particular file descriptor (EINVAL)
557+
assert_eq!(written, 0);
558+
CopyResult::Fallback(0)
559+
}
560+
Some(os_err) if mode == SpliceMode::Sendfile && os_err == libc::EOVERFLOW => {
561+
CopyResult::Fallback(written)
562+
}
563+
_ => CopyResult::Ended(Err(err)),
564+
};
565+
}
566+
}
567+
}
568+
CopyResult::Ended(Ok(written))
569+
}

library/std/src/sys/unix/kernel_copy/tests.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,8 @@ fn bench_socket_pipe_socket_copy(b: &mut test::Bencher) {
159159
let local_source = local_end.clone();
160160
crate::thread::spawn(move || {
161161
loop {
162-
crate::sys::fs::sendfile_splice(
163-
crate::sys::fs::SpliceMode::Splice,
162+
super::sendfile_splice(
163+
super::SpliceMode::Splice,
164164
local_source.as_raw_fd(),
165165
write_end.as_raw_fd(),
166166
u64::MAX,

0 commit comments

Comments
 (0)