diff --git a/src/arch/mod.rs b/src/arch/mod.rs index ff73a98..8fd087c 100644 --- a/src/arch/mod.rs +++ b/src/arch/mod.rs @@ -94,27 +94,53 @@ pub fn read_string(pid: Pid, address: c_ulonglong) -> String { string } -pub fn enable_follow_forks(pid: Pid) -> nix::Result<()> { +pub fn ptrace_init_options(pid: Pid) -> nix::Result<()> { ptrace::setoptions( pid, - Options::PTRACE_O_TRACEFORK | Options::PTRACE_O_TRACEVFORK | Options::PTRACE_O_TRACECLONE, + Options::PTRACE_O_TRACESYSGOOD + | Options::PTRACE_O_TRACEEXIT + | Options::PTRACE_O_TRACEEXEC, ) } -pub fn parse_args(pid: Pid, syscall: Sysno, registers: user_regs_struct) -> SyscallArgs { - #[allow(clippy::cast_sign_loss)] - SyscallArgs( - SYSCALLS[syscall.id() as usize] - .1 - .iter() - .filter_map(Option::as_ref) - .enumerate() - .map(|(idx, arg)| (arg, get_arg_value(registers, idx))) - .map(|(arg, value)| match arg { - SyscallArgType::Int => SyscallArg::Int(value as i64), - SyscallArgType::Str => SyscallArg::Str(read_string(pid, value)), - SyscallArgType::Addr => SyscallArg::Addr(value as usize), - }) - .collect(), +pub fn ptrace_init_options_fork(pid: Pid) -> nix::Result<()> { + ptrace::setoptions( + pid, + Options::PTRACE_O_TRACESYSGOOD + | Options::PTRACE_O_TRACEEXIT + | Options::PTRACE_O_TRACEEXEC + | Options::PTRACE_O_TRACEFORK + | Options::PTRACE_O_TRACEVFORK + | Options::PTRACE_O_TRACECLONE, ) } + +#[allow(clippy::cast_sign_loss)] +#[must_use] +// SAFTEY: In get_register_data we make sure that the syscall number will never be negative. +pub fn parse_args(pid: Pid, syscall: Sysno, registers: user_regs_struct) -> SyscallArgs { + SYSCALLS + .get(syscall.id() as usize) + .and_then(|option| option.as_ref()) + .map_or_else( + || SyscallArgs(vec![]), + |(_, args)| { + SyscallArgs( + args.iter() + .filter_map(Option::as_ref) + .enumerate() + .map(|(idx, arg_type)| map_arg(pid, registers, idx, *arg_type)) + .collect(), + ) + }, + ) +} + +fn map_arg(pid: Pid, registers: user_regs_struct, idx: usize, arg: SyscallArgType) -> SyscallArg { + let value = get_arg_value(registers, idx); + match arg { + SyscallArgType::Int => SyscallArg::Int(value as i64), + SyscallArgType::Str => SyscallArg::Str(read_string(pid, value)), + SyscallArgType::Addr => SyscallArg::Addr(value as usize), + } +} diff --git a/src/arch/x86_64.rs b/src/arch/x86_64.rs index 3499c4c..a914003 100644 --- a/src/arch/x86_64.rs +++ b/src/arch/x86_64.rs @@ -227,6 +227,7 @@ pub static TRACE_SIGNAL: SysnoSet = SysnoSet::new(&[ signalfd, signalfd4, rt_tgsigqueueinfo, + pidfd_send_signal, ]); pub static TRACE_MEMORY: SysnoSet = SysnoSet::new(&[ @@ -285,25 +286,25 @@ pub static TRACE_CLOCK: SysnoSet = SysnoSet::new(&[ macro_rules! syscall { ($name:ident $(,)?) => { - (Sysno::$name, [None, None, None, None, None, None]) + Some((Sysno::$name, [None, None, None, None, None, None])) }; ($name:ident, $arg0:ident $(,)?) => { - (Sysno::$name, [$arg0, None, None, None, None, None]) + Some((Sysno::$name, [$arg0, None, None, None, None, None])) }; ($name:ident, $arg0:ident, $arg1:ident $(,)?) => { - (Sysno::$name, [$arg0, $arg1, None, None, None, None]) + Some((Sysno::$name, [$arg0, $arg1, None, None, None, None])) }; ($name:ident, $arg0:ident, $arg1:ident, $arg2:ident $(,)?) => { - (Sysno::$name, [$arg0, $arg1, $arg2, None, None, None]) + Some((Sysno::$name, [$arg0, $arg1, $arg2, None, None, None])) }; ($name:ident, $arg0:ident, $arg1:ident, $arg2:ident, $arg3:ident $(,)?) => { - (Sysno::$name, [$arg0, $arg1, $arg2, $arg3, None, None]) + Some((Sysno::$name, [$arg0, $arg1, $arg2, $arg3, None, None])) }; ($name:ident, $arg0:ident, $arg1:ident, $arg2:ident, $arg3:ident, $arg4:ident $(,)?) => { - (Sysno::$name, [$arg0, $arg1, $arg2, $arg3, $arg4, None]) + Some((Sysno::$name, [$arg0, $arg1, $arg2, $arg3, $arg4, None])) }; ($name:ident, $arg0:ident, $arg1:ident, $arg2:ident, $arg3:ident, $arg4:ident, $arg5:ident $(,)?) => { - (Sysno::$name, [$arg0, $arg1, $arg2, $arg3, $arg4, $arg5]) + Some((Sysno::$name, [$arg0, $arg1, $arg2, $arg3, $arg4, $arg5])) }; } @@ -311,7 +312,7 @@ const ADDR: Option = Some(SyscallArgType::Addr); const INT: Option = Some(SyscallArgType::Int); const STR: Option = Some(SyscallArgType::Str); -pub static SYSCALLS: [(Sysno, [Option; 6]); 335] = [ +pub static SYSCALLS: [Option<(Sysno, [Option; 6])>; 451] = [ // DESC syscall!(read, INT, STR, INT), // DESC @@ -888,6 +889,127 @@ pub static SYSCALLS: [(Sysno, [Option; 6]); 335] = [ syscall!(statx, INT, STR, INT, INT, STR), syscall!(io_pgetevents), syscall!(rseq), + // We jump from syscall number 334 to 424 here + // See: https://git.musl-libc.org/cgit/musl/commit/?id=f3f96f2daa4d00f0e38489fb465cd0244b531abe + // https://github.com/torvalds/linux/commit/0d6040d4681735dfc47565de288525de405a5c99 + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + syscall!(pidfd_send_signal, INT, INT, ADDR, INT), + syscall!(io_uring_setup, INT, ADDR), + syscall!(io_uring_enter, INT, INT, INT, INT, ADDR, INT), + syscall!(io_uring_register, INT, INT, ADDR, INT), + syscall!(open_tree, INT, STR, INT), + syscall!(move_mount, INT, STR, INT, STR, INT), + syscall!(fsopen, STR, INT), + syscall!(fsconfig, INT, INT, STR, ADDR, INT), + syscall!(fsmount, INT, INT, INT), + syscall!(fspick, INT, STR, INT), + syscall!(pidfd_open, INT, INT), + syscall!(clone3, ADDR, INT), + syscall!(close_range, INT, INT, INT), + syscall!(openat2, INT, STR, ADDR, INT), + syscall!(pidfd_getfd, INT, INT, INT), + syscall!(faccessat2, INT, STR, INT, INT), + syscall!(process_madvise, INT, ADDR, INT, INT, INT), + syscall!(epoll_pwait2, INT, ADDR, INT, ADDR, ADDR, INT), + syscall!(mount_setattr, INT, STR, INT, ADDR, INT), + syscall!(quotactl_fd, INT, INT, INT, ADDR), + syscall!(landlock_create_ruleset, ADDR, INT, INT), + syscall!(landlock_add_rule, INT, INT, ADDR, INT), + syscall!(landlock_restrict_self, INT, INT), + syscall!(memfd_secret, INT), + syscall!(process_mrelease, INT, INT), + syscall!(futex_waitv, ADDR, INT, INT, ADDR, INT), + syscall!(set_mempolicy_home_node, INT, INT, INT, INT), + // 451 - cachestat not yet implemented by the syscall crate + // syscall!(cachestat, INT, INT, INT, INT) ]; pub fn get_arg_value(registers: user_regs_struct, i: usize) -> c_ulonglong { @@ -908,9 +1030,12 @@ mod tests { use super::*; #[test] + #[allow(clippy::cast_sign_loss)] fn test_syscall_numbers() { - for (i, (sysno, ..)) in SYSCALLS.iter().enumerate() { - assert_eq!(i, sysno.id() as usize); + for (i, sysno, ..) in SYSCALLS.iter().enumerate() { + if let Some((sysno, _)) = sysno { + assert_eq!(i, sysno.id() as usize); + } } } } diff --git a/src/lib.rs b/src/lib.rs index ee03285..2a669c2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,15 +1,13 @@ //! lurk is a pretty (simple) alternative to strace. #[deny(clippy::all, clippy::pedantic, clippy::format_push_string)] -// // TODO: re-check the casting lints - they might indicate an issue #[allow( clippy::cast_possible_truncation, clippy::cast_possible_wrap, clippy::cast_precision_loss, - clippy::if_not_else, // FIXME: remove this clippy::redundant_closure_for_method_calls, - clippy::struct_excessive_bools, + clippy::struct_excessive_bools )] pub mod arch; pub mod args; @@ -20,10 +18,13 @@ use comfy_table::modifiers::UTF8_ROUND_CORNERS; use comfy_table::presets::UTF8_BORDERS_ONLY; use comfy_table::CellAlignment::Right; use comfy_table::{Cell, ContentArrangement, Row, Table}; +use libc::{user_regs_struct, PTRACE_SYSCALL_INFO_EXIT}; use linux_personality::{personality, ADDR_NO_RANDOMIZE}; -use nix::sys::ptrace; -use nix::sys::wait::wait; +use nix::sys::ptrace::{self, Event}; +use nix::sys::signal::Signal; +use nix::sys::wait::{wait, WaitStatus}; use nix::unistd::Pid; +use std::collections::HashMap; use std::fs::OpenOptions; use std::io::{BufWriter, Write}; use std::os::unix::process::CommandExt; @@ -32,7 +33,6 @@ use std::time::{Duration, SystemTime}; use syscalls::{Sysno, SysnoMap, SysnoSet}; use users::get_user_by_name; -use crate::arch::enable_follow_forks; use crate::args::{Args, Filter}; use crate::syscall_info::{RetCode, SyscallInfo}; @@ -88,85 +88,129 @@ impl Tracer { #[allow(clippy::too_many_lines)] pub fn run_tracer(&mut self) -> Result<()> { - // Use a flag to indicate if we have already set the needed options once in a loop (if required) - let mut follow_forks = self.args.follow_forks; - // If Some(t), we expect the next syscall to be the first call of a pair of syscalls - let mut syscall_start_time: Option = None; + // Create a hashmap to track entry and exit times across all forked processes individually. + let mut start_times = HashMap::>::new(); + start_times.insert(self.pid, None); + + let mut options_initialized = false; loop { - // Wait for the next system call - wait()?; + let status = wait()?; - // TODO: move this out of the loop if possible, or explain why can't - if follow_forks { - follow_forks = false; - enable_follow_forks(self.pid)?; + if !options_initialized { + if self.args.follow_forks { + arch::ptrace_init_options_fork(self.pid)?; + } else { + arch::ptrace_init_options(self.pid)?; + } + options_initialized = true; } - let Ok(registers) = ptrace::getregs(self.pid) else { - break - }; + match status { + // `WIFSTOPPED(status), signal is WSTOPSIG(status) + WaitStatus::Stopped(pid, signal) => { + // There are three reasons why a child might stop with SIGTRAP: + // 1) syscall entry + // 2) syscall exit + // 3) child calls exec + // + // Because we are tracing with PTRACE_O_TRACESYSGOOD, syscall entry and syscall exit + // are stopped in PtraceSyscall and not here, which means if we get a SIGTRAP here, + // it's because the child called exec. + if signal == Signal::SIGTRAP { + self.log_standard_syscall(pid, None, None)?; + self.issue_ptrace_syscall_request(pid, None)?; + continue; + } - // FIXME: what is 336??? The highest syscall we have is rseq = 334 - // per syscalls crate, there is a big gap after rseq until pidfd_send_signal = 424 - if registers.orig_rax >= 336 { - continue; - } - let Ok(sys_no) = (registers.orig_rax as u32).try_into() else { - continue - }; + // If we trace with PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, and PTRACE_O_TRACECLONE, + // a created child of our tracee will stop with SIGSTOP. + // If our tracee creates children of their own, we want to trace their syscall times with a new value. + if signal == Signal::SIGSTOP { + if self.args.follow_forks { + start_times.insert(pid, None); - // ptrace gets invoked twice per system call: once before and once after execution - // only print output at second ptrace invocation - // TODO: explain why these two syscalls should be handled differently? - // TODO: should we handle if two subsequent syscalls are NOT the same? - if syscall_start_time.is_some() - || sys_no == Sysno::execve - || sys_no == Sysno::exit_group - { - let ret_code = RetCode::from_raw(registers.rax); - if let RetCode::Err(_) = ret_code { - self.syscalls_fail[sys_no] += 1 - } else { - self.syscalls_pass[sys_no] += 1 - }; - - if self.filter.matches(sys_no, ret_code) { - // Measure system call execution time - let elapsed = if let Some(start_time) = syscall_start_time { - let elapsed = SystemTime::now() - .duration_since(start_time) - .unwrap_or_default(); - self.syscalls_time[sys_no] += elapsed; - elapsed - } else { - Duration::default() - }; - - if !self.args.summary_only { - // TODO: if we follow forks, we should also capture/print the pid of the child process - let info = SyscallInfo::new(self.pid, sys_no, ret_code, registers, elapsed); - if self.args.json { - let json = serde_json::to_string(&info)?; - writeln!(&mut self.output, "{json}")?; - } else { - info.write_syscall( - self.use_colors, - self.string_limit, - self.args.syscall_number, - self.args.syscall_times, - &mut self.output, - )?; + if !self.args.summary_only { + writeln!(&mut self.output, "Attaching to child {}", pid,)?; + } } + + self.issue_ptrace_syscall_request(pid, None)?; + continue; + } + + // The SIGCHLD signal is sent to a process when a child process terminates, interrupted, or resumes after being interrupted + // This means, that if our tracee forked and said fork exits before the parent, the parent will get stopped. + // Therefor issue a PTRACE_SYSCALL request to the parent to continue execution. + // This is also important if we trace without the following forks option. + if signal == Signal::SIGCHLD { + self.issue_ptrace_syscall_request(pid, None)?; + continue; + } + + // If we fall through to here, we have another signal that's been sent to the tracee, + // in this case, just forward the singal to the tracee to let it handle it. + // TODO: Finer signal handling, edge-cases etc. + ptrace::cont(pid, signal)?; + } + // WIFEXITED(status) + WaitStatus::Exited(pid, _) => { + // If the process that exits is the original tracee, we can safely break here, + // but we need to continue if the process that exits is a child of the original tracee. + if self.pid == pid { break } else { continue }; + } + // The traced process was stopped by a `PTRACE_EVENT_*` event. + WaitStatus::PtraceEvent(pid, _, code) => { + // We stop at the PTRACE_EVENT_EXIT event because of the PTRACE_O_TRACEEXIT option. + // We do this to properly catch and log exit-family syscalls, which do not have an PTRACE_SYSCALL_INFO_EXIT event. + if code == Event::PTRACE_EVENT_EXIT as i32 && self.is_exit_syscall(pid)? { + self.log_standard_syscall(pid, None, None)?; } + + self.issue_ptrace_syscall_request(pid, None)?; } - syscall_start_time = None; - } else { - syscall_start_time = Some(SystemTime::now()); - } + // Tracee is traced with the PTRACE_O_TRACESYSGOOD option. + WaitStatus::PtraceSyscall(pid) => { + // ptrace(PTRACE_GETEVENTMSG,...) can be one of three values here: + // 1) PTRACE_SYSCALL_INFO_NONE + // 2) PTRACE_SYSCALL_INFO_ENTRY + // 3) PTRACE_SYSCALL_INFO_EXIT + let event = ptrace::getevent(pid)? as u8; + + // Snapshot current time, to avoid polluting the syscall time with + // non-syscall related latency. + let timestamp = Some(SystemTime::now()); + + // We only want to log regular syscalls on exit + if let Some(syscall_start_time) = start_times.get_mut(&pid) { + if event == PTRACE_SYSCALL_INFO_EXIT { + self.log_standard_syscall(pid, *syscall_start_time, timestamp)?; + *syscall_start_time = None; + } else { + *syscall_start_time = timestamp; + } + } else { + return Err(anyhow!("Unable to get start time for tracee {}", pid)); + } - if ptrace::syscall(self.pid, None).is_err() { - break; + self.issue_ptrace_syscall_request(pid, None)?; + } + // WIFSIGNALED(status), signal is WTERMSIG(status) and coredump is WCOREDUMP(status) + WaitStatus::Signaled(pid, signal, coredump) => { + writeln!( + &mut self.output, + "Child {} terminated by signal {} {}", + pid, + signal, + if coredump { "(core dumped)" } else { "" } + )?; + break; + } + // WIFCONTINUED(status), this usually happens when a process receives a SIGCONT. + // Just continue with the next iteration of the loop. + WaitStatus::Continued(_) | WaitStatus::StillAlive => { + continue; + } } } @@ -189,6 +233,7 @@ impl Tracer { .apply_modifier(UTF8_ROUND_CORNERS) .set_content_arrangement(ContentArrangement::Dynamic) .set_header(&headers); + for i in 0..headers.len() { table.column_mut(i).unwrap().set_cell_alignment(Right); } @@ -264,6 +309,102 @@ impl Tracer { Ok(()) } + + fn log_standard_syscall( + &mut self, + pid: Pid, + syscall_start_time: Option, + syscall_end_time: Option, + ) -> Result<()> { + let (syscall_number, registers) = self.parse_register_data(pid)?; + + // Theres no PTRACE_SYSCALL_INFO_EXIT for an exit-family syscall, hence ret_code will always be 0xffffffffffffffda (which is -38) + // -38 is ENOSYS which is put into RAX as a default return value by the kernel's syscall entry code. + // In order to not pollute the summary with this false positive, avoid exit-family syscalls from being counted (same behaviour as strace). + let ret_code = match syscall_number { + Sysno::exit | Sysno::exit_group => RetCode::from_raw(0), + _ => { + let code = RetCode::from_raw(registers.rax); + match code { + RetCode::Err(_) => self.syscalls_fail[syscall_number] += 1, + _ => self.syscalls_pass[syscall_number] += 1, + } + code + } + }; + + if self.filter.matches(syscall_number, ret_code) { + let elapsed = syscall_start_time.map_or(Duration::default(), |start_time| { + let end_time = syscall_end_time.unwrap_or(SystemTime::now()); + end_time.duration_since(start_time).unwrap_or_default() + }); + + if syscall_start_time.is_some() { + self.syscalls_time[syscall_number] += elapsed; + } + + if !self.args.summary_only { + let info = SyscallInfo::new(pid, syscall_number, ret_code, registers, elapsed); + self.write_syscall_info(&info)?; + } + } + + Ok(()) + } + + fn write_syscall_info(&mut self, info: &SyscallInfo) -> Result<()> { + if self.args.json { + let json = serde_json::to_string(&info)?; + Ok(writeln!(&mut self.output, "{json}")?) + } else { + info.write_syscall( + self.use_colors, + self.string_limit, + self.args.syscall_number, + self.args.syscall_times, + &mut self.output, + ) + } + } + + // Issue a PTRACE_SYSCALL request to the tracee, forwarding a signal if one is provided. + fn issue_ptrace_syscall_request(&self, pid: Pid, signal: Option) -> Result<()> { + ptrace::syscall(pid, signal).map_err(|_| + anyhow!( + "Unable to issue a PTRACE_SYSCALL request in tracee {}", + pid + ) + ) + } + + // TODO: This is arch-specific code and should be modularized + fn get_registers(&self, pid: Pid) -> Result { + ptrace::getregs(pid).map_err(|_| + anyhow!( + "Unable to get registers from tracee {}", + pid + ) + ) + } + + fn get_syscall(&self, registers: user_regs_struct) -> Result { + (registers.orig_rax as u32).try_into() + .map_err(|_| anyhow!("Invalid syscall number {}", registers.orig_rax)) + } + + // Issues a ptrace(PTRACE_GETREGS, ...) request and gets the corresponding syscall number (Sysno). + fn parse_register_data(&self, pid: Pid) -> Result<(Sysno, user_regs_struct)> { + let registers = self.get_registers(pid)?; + let syscall_number = self.get_syscall(registers)?; + + Ok((syscall_number, registers)) + } + + fn is_exit_syscall(&self, pid: Pid) -> Result { + self.get_registers(pid).map(|registers| { + registers.orig_rax == Sysno::exit as u64 || registers.orig_rax == Sysno::exit_group as u64 + }) + } } pub fn run_tracee(command: &[String], envs: &[String], username: &Option) -> Result<()> { diff --git a/src/syscall_info.rs b/src/syscall_info.rs index 5076f1d..db4b61b 100644 --- a/src/syscall_info.rs +++ b/src/syscall_info.rs @@ -71,23 +71,27 @@ impl SyscallInfo { arg.write(output, string_limit)?; } write!(output, ") = ")?; - if use_colors { - let style = match self.result { - RetCode::Ok(_) => Green.bold(), - RetCode::Err(_) => Red.bold(), - RetCode::Address(_) => Yellow.bold(), - }; - // TODO: it would be great if we can force termcolor to write - // the styling prefix and suffix into the formatter. - // This would allow us to use the same code for both cases, - // and avoid additional string alloc - write!(output, "{}", style.paint(self.result.to_string())) + if self.syscall == Sysno::exit || self.syscall == Sysno::exit_group { + write!(output, "?")?; } else { - write!(output, "{}", self.result) - }?; - if show_duration { - // TODO: add an option to control each syscall duration scaling, e.g. ms, us, ns - write!(output, " <{:.6}ns>", self.duration.as_nanos())?; + if use_colors { + let style = match self.result { + RetCode::Ok(_) => Green.bold(), + RetCode::Err(_) => Red.bold(), + RetCode::Address(_) => Yellow.bold(), + }; + // TODO: it would be great if we can force termcolor to write + // the styling prefix and suffix into the formatter. + // This would allow us to use the same code for both cases, + // and avoid additional string alloc + write!(output, "{}", style.paint(self.result.to_string())) + } else { + write!(output, "{}", self.result) + }?; + if show_duration { + // TODO: add an option to control each syscall duration scaling, e.g. ms, us, ns + write!(output, " <{:.6}ns>", self.duration.as_nanos())?; + } } Ok(writeln!(output)?) }