Auto merge of #108283 - the8472:remove-splice-into-pipe, r=ChrisDenton
don't splice from files into pipes in io::copy This fixes potential data ordering issue where a write performed after a copy operation could become visible in the copy even though it signaled completion. I assumed that by not setting `SPLICE_F_MOVE` we would be safe and the kernel would do a copy in kernel space and we could avoid the read-write syscall and copy-to/from-userspace costs. But apparently that flag only makes a difference when splicing from a pipe, but not when splicing into it. Context: https://lkml.org/lkml/2023/2/9/673
This commit is contained in:
commit
a29dada983
2 changed files with 88 additions and 16 deletions
|
@ -17,11 +17,9 @@
|
|||
//! Once it has obtained all necessary pieces and brought any wrapper types into a state where they
|
||||
//! can be safely bypassed it will attempt to use the `copy_file_range(2)`,
|
||||
//! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors.
|
||||
//! Since those syscalls have requirements that cannot be fully checked in advance and
|
||||
//! gathering additional information about file descriptors would require additional syscalls
|
||||
//! anyway it simply attempts to use them one after another (guided by inaccurate hints) to
|
||||
//! figure out which one works and falls back to the generic read-write copy loop if none of them
|
||||
//! does.
|
||||
//! Since those syscalls have requirements that cannot be fully checked in advance it attempts
|
||||
//! to use them one after another (guided by hints) to figure out which one works and
|
||||
//! falls back to the generic read-write copy loop if none of them does.
|
||||
//! Once a working syscall is found for a pair of file descriptors it will be called in a loop
|
||||
//! until the copy operation is completed.
|
||||
//!
|
||||
|
@ -84,14 +82,10 @@ pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>(
|
|||
/// The methods on this type only provide hints, due to `AsRawFd` and `FromRawFd` the inferred
|
||||
/// type may be wrong.
|
||||
enum FdMeta {
|
||||
/// We obtained the FD from a type that can contain any type of `FileType` and queried the metadata
|
||||
/// because it is cheaper than probing all possible syscalls (reader side)
|
||||
Metadata(Metadata),
|
||||
Socket,
|
||||
Pipe,
|
||||
/// We don't have any metadata, e.g. because the original type was `File` which can represent
|
||||
/// any `FileType` and we did not query the metadata either since it did not seem beneficial
|
||||
/// (writer side)
|
||||
/// We don't have any metadata because the stat syscall failed
|
||||
NoneObtained,
|
||||
}
|
||||
|
||||
|
@ -131,6 +125,39 @@ impl FdMeta {
|
|||
}
|
||||
}
|
||||
|
||||
/// Returns true either if changes made to the source after a sendfile/splice call won't become
|
||||
/// visible in the sink or the source has explicitly opted into such behavior (e.g. by splicing
|
||||
/// a file into a pipe, the pipe being the source in this case).
|
||||
///
|
||||
/// This will prevent File -> Pipe and File -> Socket splicing/sendfile optimizations to uphold
|
||||
/// the Read/Write API semantics of io::copy.
|
||||
///
|
||||
/// Note: This is not 100% airtight, the caller can use the RawFd conversion methods to turn a
|
||||
/// regular file into a TcpSocket which will be treated as a socket here without checking.
|
||||
fn safe_kernel_copy(source: &FdMeta, sink: &FdMeta) -> bool {
|
||||
match (source, sink) {
|
||||
// Data arriving from a socket is safe because the sender can't modify the socket buffer.
|
||||
// Data arriving from a pipe is safe(-ish) because either the sender *copied*
|
||||
// the bytes into the pipe OR explicitly performed an operation that enables zero-copy,
|
||||
// thus promising not to modify the data later.
|
||||
(FdMeta::Socket, _) => true,
|
||||
(FdMeta::Pipe, _) => true,
|
||||
(FdMeta::Metadata(meta), _)
|
||||
if meta.file_type().is_fifo() || meta.file_type().is_socket() =>
|
||||
{
|
||||
true
|
||||
}
|
||||
// Data going into non-pipes/non-sockets is safe because the "later changes may become visible" issue
|
||||
// only happens for pages sitting in send buffers or pipes.
|
||||
(_, FdMeta::Metadata(meta))
|
||||
if !meta.file_type().is_fifo() && !meta.file_type().is_socket() =>
|
||||
{
|
||||
true
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
struct CopyParams(FdMeta, Option<RawFd>);
|
||||
|
||||
struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> {
|
||||
|
@ -186,7 +213,8 @@ impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> {
|
|||
// So we just try and fallback if needed.
|
||||
// If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead
|
||||
// fall back to the generic copy loop.
|
||||
if input_meta.potential_sendfile_source() {
|
||||
if input_meta.potential_sendfile_source() && safe_kernel_copy(&input_meta, &output_meta)
|
||||
{
|
||||
let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write);
|
||||
result.update_take(reader);
|
||||
|
||||
|
@ -197,7 +225,9 @@ impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> {
|
|||
}
|
||||
}
|
||||
|
||||
if input_meta.maybe_fifo() || output_meta.maybe_fifo() {
|
||||
if (input_meta.maybe_fifo() || output_meta.maybe_fifo())
|
||||
&& safe_kernel_copy(&input_meta, &output_meta)
|
||||
{
|
||||
let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write);
|
||||
result.update_take(reader);
|
||||
|
||||
|
@ -298,13 +328,13 @@ impl CopyRead for &File {
|
|||
|
||||
impl CopyWrite for File {
|
||||
fn properties(&self) -> CopyParams {
|
||||
CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
|
||||
CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
|
||||
}
|
||||
}
|
||||
|
||||
impl CopyWrite for &File {
|
||||
fn properties(&self) -> CopyParams {
|
||||
CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
|
||||
CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -401,13 +431,13 @@ impl CopyRead for StdinLock<'_> {
|
|||
|
||||
impl CopyWrite for StdoutLock<'_> {
|
||||
fn properties(&self) -> CopyParams {
|
||||
CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
|
||||
CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
|
||||
}
|
||||
}
|
||||
|
||||
impl CopyWrite for StderrLock<'_> {
|
||||
fn properties(&self) -> CopyParams {
|
||||
CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
|
||||
CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -83,6 +83,48 @@ fn copies_append_mode_sink() -> Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dont_splice_pipes_from_files() -> Result<()> {
|
||||
// splicing to a pipe and then modifying the source could lead to changes
|
||||
// becoming visible in an unexpected order.
|
||||
|
||||
use crate::io::SeekFrom;
|
||||
use crate::os::unix::fs::FileExt;
|
||||
use crate::process::{ChildStdin, ChildStdout};
|
||||
use crate::sys_common::FromInner;
|
||||
|
||||
let (read_end, write_end) = crate::sys::pipe::anon_pipe()?;
|
||||
|
||||
let mut read_end = ChildStdout::from_inner(read_end);
|
||||
let mut write_end = ChildStdin::from_inner(write_end);
|
||||
|
||||
let tmp_path = tmpdir();
|
||||
let file = tmp_path.join("to_be_modified");
|
||||
let mut file =
|
||||
crate::fs::OpenOptions::new().create_new(true).read(true).write(true).open(file)?;
|
||||
|
||||
const SZ: usize = libc::PIPE_BUF as usize;
|
||||
|
||||
// put data in page cache
|
||||
let mut buf: [u8; SZ] = [0x01; SZ];
|
||||
file.write_all(&buf).unwrap();
|
||||
|
||||
// copy page into pipe
|
||||
file.seek(SeekFrom::Start(0)).unwrap();
|
||||
assert!(io::copy(&mut file, &mut write_end).unwrap() == SZ as u64);
|
||||
|
||||
// modify file
|
||||
buf[0] = 0x02;
|
||||
file.write_at(&buf, 0).unwrap();
|
||||
|
||||
// read from pipe
|
||||
read_end.read_exact(buf.as_mut_slice()).unwrap();
|
||||
|
||||
assert_eq!(buf[0], 0x01, "data in pipe should reflect the original, not later modifications");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_file_to_file_copy(b: &mut test::Bencher) {
|
||||
const BYTES: usize = 128 * 1024;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue