diff --git a/crates/ark/src/shell.rs b/crates/ark/src/shell.rs
index 01cfe2ebe..cd60edc5a 100644
--- a/crates/ark/src/shell.rs
+++ b/crates/ark/src/shell.rs
@@ -115,7 +115,7 @@ impl Shell {
         &self,
         req: &IsCompleteRequest,
     ) -> Result {
-        match harp::parse_status(req.code.as_str()) {
+        match harp::parse_status(&harp::ParseInput::Text(req.code.as_str())) {
             Ok(ParseResult::Complete(_)) => Ok(IsCompleteReply {
                 status: IsComplete::Complete,
                 indent: String::from(""),
diff --git a/crates/harp/src/lib.rs b/crates/harp/src/lib.rs
index 2b4ea136e..154c21069 100644
--- a/crates/harp/src/lib.rs
+++ b/crates/harp/src/lib.rs
@@ -19,6 +19,7 @@ pub mod line_ending;
 pub mod modules;
 pub mod object;
 pub mod parse;
+pub mod parser;
 pub mod polled_events;
 pub mod protect;
 pub mod r_version;
@@ -41,6 +42,7 @@ pub mod vector;
 pub use eval::*;
 pub use object::*;
 pub use parse::*;
+pub use parser::*;
 pub use source::*;
 pub use table::*;
 pub use vector::list::*;
diff --git a/crates/harp/src/parse.rs b/crates/harp/src/parse.rs
index 1169b4c3b..d44a8f881 100644
--- a/crates/harp/src/parse.rs
+++ b/crates/harp/src/parse.rs
@@ -7,22 +7,38 @@
 
 use std::ffi::CStr;
 
-use libr::SEXP;
+use itertools::Itertools;
 
-use crate::exec::RFunction;
-use crate::exec::RFunctionExt;
 use crate::line_ending::convert_line_endings;
 use crate::line_ending::LineEnding;
 use crate::protect::RProtect;
 use crate::r_string;
+use crate::srcref;
 use crate::try_catch;
+use crate::vector::CharacterVector;
+use crate::vector::Vector;
 use crate::RObject;
 
+pub struct RParseOptions { // NOTE(review): not referenced anywhere else in this patch
+    pub srcfile: Option, // NOTE(review): type argument looks stripped (`Option<RObject>`?) - confirm
+}
+
 pub enum ParseResult {
-    Complete(SEXP),
+    Complete(RObject), // The value `parse_status()` gets back from `R_ParseVector()`
     Incomplete,
 }
 
+pub enum ParseInput<'a> { // What `parse_status()` is asked to parse
+    Text(&'a str), // Code as text; parsed without a srcfile
+    SrcFile(RObject), // Srcfile object; its lines are parsed and it is handed to the parser
+}
+
+impl Default for RParseOptions { // Defaults to no srcfile
+    fn default() -> Self {
+        Self { srcfile: None }
+    }
+}
+
 /// Returns a single expression
 pub fn parse_expr(code: &str) -> crate::Result {
     unsafe {
@@ -42,47 +58,43 @@ pub fn parse_expr(code: &str) -> crate::Result {
 }
 
 /// Returns an EXPRSXP
vector
-pub fn parse_exprs(code: &str) -> crate::Result<RObject> {
-    match parse_status(code)? {
-        ParseResult::Complete(x) => {
-            return Ok(RObject::from(x));
-        },
-        ParseResult::Incomplete => {
-            return Err(crate::Error::ParseError {
-                code: code.to_string(),
-                message: String::from("Incomplete code"),
-            });
-        },
-    };
+pub fn parse_exprs(text: &str) -> crate::Result<RObject> {
+    parse_exprs_ext(&ParseInput::Text(text))
 }
 
-/// This uses the R-level function `parse()` to create the srcrefs
-pub fn parse_exprs_with_srcrefs(code: &str) -> crate::Result<RObject> {
-    unsafe {
-        let mut protect = RProtect::new();
-
-        // Because `parse(text =)` doesn't allow `\r\n` even on Windows
-        let code = convert_line_endings(code, LineEnding::Posix);
-        let code = r_string!(code, protect);
+/// Same as `parse_exprs()` but parses via a virtual srcfile so that srcrefs are created
+pub fn parse_exprs_with_srcrefs(text: &str) -> crate::Result<RObject> {
+    let srcfile = srcref::new_srcfile_virtual(text)?;
+    parse_exprs_ext(&ParseInput::SrcFile(srcfile))
+}
 
-        RFunction::new("base", "parse")
-            .param("text", code)
-            .param("keep.source", true)
-            .call()
+/// Parses `input` and turns an incomplete parse into a `ParseError`
+fn parse_exprs_ext<'a>(input: &ParseInput<'a>) -> crate::Result<RObject> {
+    match parse_status(input)? {
+        ParseResult::Complete(x) => Ok(x),
+        ParseResult::Incomplete => Err(crate::Error::ParseError {
+            code: parse_input_as_string(input).unwrap_or(String::from("Conversion error")),
+            message: String::from("Incomplete code"),
+        }),
     }
 }
 
-pub fn parse_status(code: &str) -> crate::Result<ParseResult> {
+pub fn parse_status<'a>(input: &ParseInput<'a>) -> crate::Result<ParseResult> {
     unsafe {
-        let mut ps: libr::ParseStatus = libr::ParseStatus_PARSE_NULL;
-        let mut protect = RProtect::new();
-        let r_code = r_string!(convert_line_endings(code, LineEnding::Posix), &mut protect);
+        // TODO: set keep.parse.data
+
+        let mut status: libr::ParseStatus = libr::ParseStatus_PARSE_NULL;
+        // Text input is parsed without a srcfile; a srcfile input supplies its own lines
+        let (text, srcfile) = match input {
+            ParseInput::Text(text) => (as_parse_text(text), RObject::null()),
+            ParseInput::SrcFile(srcfile) => (srcref::srcfile_lines(srcfile.sexp)?, srcfile.clone()),
+        };
 
         let result: RObject =
-            try_catch(|| libr::R_ParseVector(r_code, -1, &mut ps, libr::R_NilValue).into())?;
+            try_catch(|| libr::R_ParseVector(text.sexp, -1, &mut status, srcfile.sexp).into())?;
 
-        match ps {
-            libr::ParseStatus_PARSE_OK => Ok(ParseResult::Complete(result.sexp)),
+        match status {
+            libr::ParseStatus_PARSE_OK => Ok(ParseResult::Complete(result)),
             libr::ParseStatus_PARSE_INCOMPLETE => Ok(ParseResult::Incomplete),
             libr::ParseStatus_PARSE_ERROR => Err(crate::Error::ParseSyntaxError {
                 message: CStr::from_ptr(libr::get(libr::R_ParseErrorMsg).as_ptr())
@@ -91,9 +103,9 @@ pub fn parse_status(code: &str) -> crate::Result<ParseResult> {
                 line: libr::get(libr::R_ParseError) as i32,
             }),
             _ => {
-                // should not get here
+                // Should not get here
                 Err(crate::Error::ParseError {
-                    code: code.to_string(),
+                    code: parse_input_as_string(input).unwrap_or(String::from("Conversion error")),
                     message: String::from("Unknown parse error"),
                 })
             },
@@ -101,26 +113,66 @@ pub fn parse_status(code: &str) -> crate::Result<ParseResult> {
     }
 }
 
+/// Wraps `text` in an R object usable as parser input. Line endings are converted
+/// to `LineEnding::Posix` first so that `\r\n` never reaches the parser.
+pub fn as_parse_text(text: &str) -> RObject {
+    unsafe {
+        let mut protect = RProtect::new();
+        let input = r_string!(convert_line_endings(text, LineEnding::Posix), &mut protect);
+        input.into()
+    }
+}
+
+/// Returns the code held by `input` as a single string. For a srcfile, the lines
+/// returned by `srcref::srcfile_lines()` are joined with `\n`, with missing
+/// elements rendered as `NA`.
+fn parse_input_as_string<'a>(input: &ParseInput<'a>) -> crate::Result<String> {
+    Ok(match input {
+        ParseInput::Text(text) => text.to_string(),
+        ParseInput::SrcFile(srcfile) => {
+            let lines = srcref::srcfile_lines(srcfile.sexp)?;
+            let lines = unsafe { CharacterVector::new(lines)? };
+
+            lines
+                .iter()
+                .map(|x| x.unwrap_or(String::from("NA")))
+                .join("\n")
+        },
+    })
+}
+
 #[cfg(test)]
 mod tests {
     use crate::assert_match;
+    use crate::parse::parse_input_as_string;
+    use crate::parse::ParseInput;
     use crate::parse_status;
+    use crate::r_length;
     use crate::r_stringify;
     use crate::r_symbol;
     use crate::r_test;
     use crate::r_typeof;
+    use crate::srcref;
     use crate::ParseResult;
 
     #[test]
     fn test_parse_status() {
         r_test! {
-            // complete
             assert_match!(
-                parse_status("force(42)"),
+                parse_status(&ParseInput::Text("")),
+                Ok(ParseResult::Complete(out)) => {
+                    assert_eq!(r_typeof(out.sexp), libr::EXPRSXP as u32);
+                    assert_eq!(r_length(out.sexp), 0);
+                }
+            );
+
+            // Complete
+            assert_match!(
+                parse_status(&ParseInput::Text("force(42)")),
                 Ok(ParseResult::Complete(out)) => {
-                    assert_eq!(r_typeof(out), libr::EXPRSXP as u32);
+                    assert_eq!(r_typeof(out.sexp), libr::EXPRSXP as u32);
 
-                    let call = libr::VECTOR_ELT(out, 0);
+                    let call = libr::VECTOR_ELT(out.sexp, 0);
                     assert_eq!(r_typeof(call), libr::LANGSXP as u32);
                     assert_eq!(libr::Rf_xlength(call), 2);
                     assert_eq!(libr::CAR(call), r_symbol!("force"));
@@ -131,21 +183,21 @@ mod tests {
                 }
             );
 
-            // incomplete
+            // Incomplete
             assert_match!(
-                parse_status("force(42"),
+                parse_status(&ParseInput::Text("force(42")),
                 Ok(ParseResult::Incomplete)
             );
 
-            // error
+            // Error
             assert_match!(
-                parse_status("42 + _"),
+                parse_status(&ParseInput::Text("42 + _")),
                 Err(_) => {}
             );
 
             // "normal" syntax error
             assert_match!(
-                parse_status("1+1\n*42"),
+                parse_status(&ParseInput::Text("1+1\n*42")),
                 Err(crate::Error::ParseSyntaxError {message, line}) => {
                     assert!(message.contains("unexpected"));
                     assert_eq!(line, 2);
@@ -154,21 +206,37 @@ mod tests {
 
             // CRLF in the code string, like a file with CRLF line endings
             assert_match!(
-                parse_status("x<-\r\n1\r\npi"),
+                parse_status(&ParseInput::Text("x<-\r\n1\r\npi")),
                 Ok(ParseResult::Complete(out)) => {
-                    assert_eq!(r_typeof(out), libr::EXPRSXP as u32);
-                    assert_eq!(r_stringify(out, "").unwrap(), "expression(x <- 1, pi)");
+                    assert_eq!(r_typeof(out.sexp), libr::EXPRSXP as u32);
+                    assert_eq!(r_stringify(out.sexp, "").unwrap(), "expression(x <- 1, pi)");
                 }
             );
 
             // CRLF inside a string literal in the code
             assert_match!(
-                parse_status(r#"'a\r\nb'"#),
+                parse_status(&ParseInput::Text(r#"'a\r\nb'"#)),
                 Ok(ParseResult::Complete(out)) => {
-                    assert_eq!(r_typeof(out), libr::EXPRSXP as u32);
-                    assert_eq!(r_stringify(out, "").unwrap(), r#"expression("a\r\nb")"#);
+                    assert_eq!(r_typeof(out.sexp), libr::EXPRSXP as u32);
+                    assert_eq!(r_stringify(out.sexp, "").unwrap(), r#"expression("a\r\nb")"#);
                 }
             );
         }
     }
+
+    #[test]
+    fn test_parse_input_as_string() {
+        r_test! {
+            assert_eq!(
+                parse_input_as_string(&ParseInput::Text("foo\nbar")).unwrap(),
+                "foo\nbar"
+            );
+
+            let input = srcref::new_srcfile_virtual("foo\nbar").unwrap();
+            assert_eq!(
+                parse_input_as_string(&ParseInput::SrcFile(input)).unwrap(),
+                "foo\nbar"
+            );
+        }
+    }
 }
diff --git a/crates/harp/src/parser/mod.rs b/crates/harp/src/parser/mod.rs
new file mode 100644
index 000000000..12b27efa2
--- /dev/null
+++ b/crates/harp/src/parser/mod.rs
@@ -0,0 +1 @@
+pub mod srcref;
diff --git a/crates/harp/src/parser/srcref.rs b/crates/harp/src/parser/srcref.rs
new file mode 100644
index 000000000..a492cb882
--- /dev/null
+++ b/crates/harp/src/parser/srcref.rs
@@ -0,0 +1,185 @@
+//
+// srcref.rs
+//
+// Copyright (C) 2024 Posit Software, PBC. All rights reserved.
+//
+//
+
+use core::f64;
+
+use anyhow::anyhow;
+use libr::SEXP;
+use stdext::unwrap;
+
+use crate::exec::RFunction;
+use crate::exec::RFunctionExt;
+use crate::vector::IntegerVector;
+use crate::vector::Vector;
+use crate::RObject;
+
+/// Structured representation of `srcref` integer vectors,
+/// using 0-based, end-exclusive ranges.
+#[derive(Debug)]
+pub struct SrcRef {
+    /// Lines and virtual lines may differ if a `#line` directive is used in code:
+    /// the former just counts actual lines, the latter respects the directive.
+    /// `line` corresponds to `line_parsed` in the original base R srcref vector.
+    pub line: std::ops::Range<usize>,
+    pub line_virtual: std::ops::Range<usize>,
+
+    /// Bytes and columns may be different due to multibyte characters.
+    pub column: std::ops::Range<usize>,
+    pub column_byte: std::ops::Range<usize>,
+}
+
+// Takes user-facing object as input. The srcrefs are retrieved from
+// attributes.
+impl RObject {
+    pub fn srcrefs(&self) -> anyhow::Result<Vec<SrcRef>> { // One `SrcRef` per `srcref` element
+        let srcref = unwrap!(self.attr("srcref"), None => {
+            return Err(anyhow!("Can't find `srcref` attribute"));
+        });
+
+        unsafe {
+            crate::List::new(srcref.sexp)?
+                .iter()
+                .map(|x| SrcRef::try_from(RObject::view(x)))
+                .collect()
+        }
+    }
+}
+
+// Takes an individual `srcref` integer vector (at least 6 elements) as input
+impl TryFrom<RObject> for SrcRef {
+    type Error = anyhow::Error;
+
+    fn try_from(value: RObject) -> anyhow::Result<Self> {
+        crate::r_assert_type(value.sexp, &[libr::INTSXP])?;
+        crate::r_assert_capacity(value.sexp, 6)?;
+
+        let value = unsafe { IntegerVector::new(value)? };
+
+        // The srcref values are adjusted to produce a `[ )` range as expected
+        // by `std::ops::Range` that counts from 0. This is in contrast to the
+        // ranges in `srcref` vectors which are 1-based `[ ]`.
+
+        // Change from 1-based to 0-based counting, clamping at 0 so that a
+        // non-positive value can't wrap around to a huge `usize`
+        let adjust_start = |i: i32| i.saturating_sub(1).max(0) as usize;
+        // Change from 1-based to 0-based counting (-1) and make it an exclusive
+        // boundary (+1). So essentially a no-op, apart from the clamping at 0.
+        let adjust_end = |i: i32| i.max(0) as usize;
+        // Layout: first_line, first_byte, last_line, last_byte, first_column, last_column
+        let line_start = adjust_start(value.get_value(0)?);
+        let column_start = adjust_start(value.get_value(4)?);
+        let column_byte_start = adjust_start(value.get_value(1)?);
+
+        let line_end = adjust_end(value.get_value(2)?);
+        let column_end = adjust_end(value.get_value(5)?);
+        let column_byte_end = adjust_end(value.get_value(3)?);
+
+        let line = std::ops::Range {
+            start: line_start,
+            end: line_end,
+        };
+        // Elements 7 and 8 are only read from 8-element srcrefs; otherwise reuse `line`
+        let line_parsed = if unsafe { value.len() >= 8 } {
+            let line_parsed_start = adjust_start(value.get_value(6)?);
+            let line_parsed_end = adjust_end(value.get_value(7)?);
+            std::ops::Range {
+                start: line_parsed_start,
+                end: line_parsed_end,
+            }
+        } else {
+            line.clone()
+        };
+
+        let column = std::ops::Range {
+            start: column_start,
+            end: column_end,
+        };
+
+        let column_byte = std::ops::Range {
+            start: column_byte_start,
+            end: column_byte_end,
+        };
+
+        Ok(Self {
+            line: line_parsed,
+            line_virtual: line,
+            column,
+            column_byte,
+        })
+    }
+}
+
+/// Creates the same sort of srcfile object as with `parse(text = )`.
+/// Takes code as a string, which may contain newlines.
+pub fn new_srcfile_virtual(text: &str) -> crate::Result<RObject> {
+    let input = crate::as_parse_text(text);
+    RFunction::new("base", "srcfilecopy")
+        .param("filename", "") // NOTE(review): `parse(text = )` names it "<text>" - confirm
+        .param("lines", input)
+        .call()
+}
+
+pub fn srcfile_lines(srcfile: SEXP) -> crate::Result<RObject> { // All lines of `srcfile`
+    RFunction::new("base", "getSrcLines")
+        .add(srcfile)
+        .param("first", 1)
+        .param("last", f64::INFINITY)
+        .call()
+}
+
+#[cfg(test)]
+mod tests {
+    use std::ops::Range;
+
+    use crate::srcref::SrcRef;
+    use crate::test::r_test;
+
+    #[test]
+    fn test_srcref() {
+        r_test(|| {
+            let exprs = crate::parse_exprs_with_srcrefs("foo\n\nś\nbar(\n\n)").unwrap();
+            let srcrefs: Vec<SrcRef> = exprs.srcrefs().unwrap();
+            let foo = &srcrefs[0];
+            let utf8 = &srcrefs[1];
+            let bar = &srcrefs[2];
+
+            assert_eq!(foo.line, Range { start: 0, end: 1 });
+            assert_eq!(foo.line_virtual, Range { start: 0, end: 1 });
+            assert_eq!(foo.column, Range { start: 0, end: 3 });
+            assert_eq!(foo.column_byte, Range { start: 0, end: 3 });
+
+            // `column_byte` is different because the character takes up two bytes
+            assert_eq!(utf8.line, Range { start: 2, end: 3 });
+            assert_eq!(utf8.line_virtual, Range { start: 2, end: 3 });
+            assert_eq!(utf8.column, Range { start: 0, end: 1 });
+            assert_eq!(utf8.column_byte, Range { start: 0, end: 2 });
+
+            // Ends on different lines
+            assert_eq!(bar.line, Range { start: 3, end: 6 });
+            assert_eq!(bar.line_virtual, Range { start: 3, end: 6 });
+            assert_eq!(bar.column, Range { start: 0, end: 1 });
+            assert_eq!(bar.column_byte, Range { start: 0, end: 1 });
+        })
+    }
+
+    #[test]
+    fn test_srcref_line_directive() {
+        r_test(|| {
+            let exprs = crate::parse_exprs_with_srcrefs("foo\n#line 5\nbar").unwrap();
+            let srcrefs: Vec<SrcRef> = exprs.srcrefs().unwrap();
+            let foo = &srcrefs[0];
+            let bar = &srcrefs[1];
+
+            assert_eq!(foo.line, Range { start: 0, end: 1 });
+            assert_eq!(foo.line_virtual, Range { start: 0, end: 1 });
+
+            // Custom line via directive
+            assert_eq!(bar.line, Range { start: 2, end: 3 });
+            assert_eq!(bar.line_virtual, Range { start: 4, end: 5 });
+        })
+    }
+}
diff --git a/crates/harp/src/utils.rs b/crates/harp/src/utils.rs
index d2410180a..f5a2560ec 100644
--- a/crates/harp/src/utils.rs
+++ b/crates/harp/src/utils.rs
@@ -138,8 +138,8 @@ pub fn r_assert_type(object: SEXP, expected: &[u32]) -> Result {
     Ok(actual)
 }
 
-pub unsafe fn r_assert_capacity(object: SEXP, required: usize) -> Result {
-    let actual = Rf_xlength(object) as usize;
+pub fn r_assert_capacity(object: SEXP, required: usize) -> Result {
+    let actual = unsafe { Rf_xlength(object) } as usize;
     if actual < required {
         return Err(Error::UnexpectedLength(actual, required));
     }
diff --git a/crates/harp/src/vector/mod.rs b/crates/harp/src/vector/mod.rs
index a969d04ff..25fca1193 100644
--- a/crates/harp/src/vector/mod.rs
+++ b/crates/harp/src/vector/mod.rs
@@ -69,9 +69,11 @@ pub trait Vector {
     }
 
+    /// Checked element access: returns an error unless `index` is below the
+    /// length reported for the underlying vector.
     fn get(&self, index: isize) -> Result> {
-        unsafe {
-            r_assert_capacity(self.data(), index as usize)?;
-        }
+        // Reading element `index` needs a capacity of `index + 1`. A negative
+        // index casts to a huge value and is rejected as well.
+        r_assert_capacity(self.data(), (index as usize).saturating_add(1))?;
 
         Ok(self.get_unchecked(index))
     }