Skip to content

Commit

Permalink
Merge pull request #508 from posit-dev/feature/parse-data
Browse files Browse the repository at this point in the history
Add `ParseData` type
  • Loading branch information
lionel- authored Sep 17, 2024
2 parents 47ced97 + 80f7d40 commit b7985c6
Show file tree
Hide file tree
Showing 20 changed files with 704 additions and 171 deletions.
11 changes: 10 additions & 1 deletion crates/ark/src/variables/variable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ use harp::vector::names::Names;
use harp::vector::CharacterVector;
use harp::vector::IntegerVector;
use harp::vector::Vector;
use harp::TableDim;
use itertools::Itertools;
use libr::*;
use stdext::local;
Expand Down Expand Up @@ -96,7 +97,15 @@ impl WorkspaceVariableDisplayValue {
}

fn from_data_frame(value: SEXP) -> Self {
let dim = harp::df_dim(value);
let dim = match unsafe { harp::df_dim(value) } {
Ok(dim) => dim,
// FIXME: Needs more type safety
Err(_) => TableDim {
num_rows: -1,
num_cols: -1,
},
};

let class = match r_classes(value) {
None => String::from(""),
Some(classes) => match classes.get_unchecked(0) {
Expand Down
47 changes: 3 additions & 44 deletions crates/harp/harp-macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ fn invalid_extern(stream: impl ToTokens) -> ! {
);
}

// TODO: Can we move more of this to the `Vector` trait?

#[proc_macro_attribute]
pub fn vector(_attr: TokenStream, item: TokenStream) -> TokenStream {
// TODO: How do we parse an attribute?
Expand Down Expand Up @@ -61,50 +63,7 @@ pub fn vector(_attr: TokenStream, item: TokenStream) -> TokenStream {
impl std::convert::TryFrom<libr::SEXP> for #ident {
type Error = crate::error::Error;
fn try_from(value: libr::SEXP) -> Result<Self, Self::Error> {
unsafe { Self::new(value) }
}
}

pub struct VectorIter<'a> {
data: &'a #ident,
index: isize,
size: isize,
}

impl<'a> std::iter::Iterator for VectorIter<'a> {
type Item = Option<<#ident as Vector>::Type>;

fn next(&mut self) -> Option<Self::Item> {
if self.index == self.size {
return None;
}

// TODO: having the iterator to call get_unchecked()
// feels wrong because down the line this will
// need to call REAL_ELT(), STRING_ELT() etc ...
// which has some extra cost one the R side
//
// This is the opposite problem of calling
// DATAPTR() which gives a contiguous array
// but has to materialize for it which might be
// costly for ALTREP() vectors
//
// The compromise that was used in cpp11 is to use
// GET_REGION and work on partial materialization
let item = unsafe { self.data.get_unchecked(self.index) };
self.index = self.index + 1;
Some(item)
}
}

impl #ident {
pub fn iter(&self) -> VectorIter<'_> {
let size = unsafe { self.len() as isize };
VectorIter {
data: self,
index: 0,
size: size,
}
super::try_r_vector_from_r_sexp(value)
}
}

Expand Down
178 changes: 178 additions & 0 deletions crates/harp/src/data_frame.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
use crate::vector::Vector;
use crate::List;
use crate::RObject;

/// Typed data frame
///
/// Type guarantees:
/// - Storage type is list.
/// - Class is `"data.frame"`.
/// - All columns are vectors and the same size as the data frame.
#[derive(Debug)]
pub struct DataFrame {
pub list: List,
pub obj: RObject,

pub names: Vec<String>,
pub nrow: usize,
pub ncol: usize,
}

impl DataFrame {
pub fn new(sexp: libr::SEXP) -> harp::Result<Self> {
let list = List::new(sexp)?;
harp::assert_class(sexp, "data.frame")?;

let dim = unsafe { harp::df_dim(list.obj.sexp) }?;
let nrow = dim.num_rows as usize;
let ncol = list.obj.length() as usize;

let Some(names) = list.obj.names() else {
return Err(harp::anyhow!("Data frame must have names"));
};
let Ok(names) = harp::assert_non_optional(names) else {
return Err(harp::anyhow!("Data frame can't have missing names"));
};

// Validate columns
for obj in list.iter() {
let obj = RObject::view(obj);

if unsafe { libr::Rf_isVector(obj.sexp) == 0 } {
return Err(harp::anyhow!("Data frame column must be a vector"));
}

if obj.length() as usize != nrow {
return Err(harp::anyhow!(
"Data frame column must be the same size as the number of rows"
));
}
}

// SAFETY: Protected by `list`
let obj = RObject::view(sexp);

Ok(Self {
list,
obj,
names,
nrow,
ncol,
})
}

pub fn col(&self, name: &str) -> harp::Result<RObject> {
let Some(idx) = self.names.iter().position(|n| n == name) else {
return Err(harp::Error::MissingColumnError { name: name.into() });
};

self.list
// FIXME: `get()` should take `usize`
.get(idx as isize)?
.ok_or_else(|| harp::unreachable!("missing column"))
}
}

#[cfg(test)]
mod tests {
use crate::assert_match;
use crate::r_alloc_integer;
use crate::r_chr_poke;
use crate::r_list_poke;
use crate::test::r_test;
use crate::vector::Vector;
use crate::DataFrame;
use crate::List;
use crate::RObject;

#[test]
fn test_data_frame_structure() {
r_test(|| {
let df = harp::parse_eval_base("data.frame(x = 1:2, y = 3:4)").unwrap();
let df = DataFrame::new(df.sexp).unwrap();

assert_match!(df.list, List { .. });
assert_match!(df.obj, RObject { .. });

assert_eq!(df.names, vec![String::from("x"), String::from("y")]);
assert_eq!(df.nrow, 2);
assert_eq!(df.ncol, 2);
})
}

#[test]
fn test_data_frame_no_names() {
r_test(|| {
let df = harp::parse_eval_base("data.frame(x = 1:2, y = 3:4)").unwrap();
df.set_attr("names", RObject::null().sexp);
let df = DataFrame::new(df.sexp);

assert_match!(df, harp::Result::Err(err) => {
assert!(format!("{err}").contains("must have names"))
});
})
}

#[test]
fn test_data_frame_bad_names() {
r_test(|| {
let df = harp::parse_eval_base("data.frame(x = 1:2, y = 3:4)").unwrap();
let nms = df.attr("names").unwrap();
unsafe {
r_chr_poke(nms.sexp, 0, libr::R_NaString);
}
let df = DataFrame::new(df.sexp);

assert_match!(df, harp::Result::Err(err) => {
assert!(format!("{err}").contains("missing names"))
});
})
}

#[test]
fn test_data_frame_bad_column_type() {
r_test(|| {
let df = harp::parse_eval_base("data.frame(x = 1:2, y = 3:4)").unwrap();
r_list_poke(df.sexp, 0, RObject::null().sexp);

let df = DataFrame::new(df.sexp);

assert_match!(df, harp::Result::Err(err) => {
assert!(format!("{err}").contains("must be a vector"))
});
})
}

#[test]
fn test_data_frame_bad_column_size() {
r_test(|| {
let df = harp::parse_eval_base("data.frame(x = 1:2, y = 3:4)").unwrap();
let bad_col = r_alloc_integer(3);
r_list_poke(df.sexp, 0, bad_col);

let df = DataFrame::new(df.sexp);

assert_match!(df, harp::Result::Err(err) => {
assert!(format!("{err}").contains("must be the same size"))
});
})
}

#[test]
fn test_data_frame_col() {
r_test(|| {
let df = harp::parse_eval_base("data.frame(x = 1:2, y = 3:4)").unwrap();
let df = DataFrame::new(df.sexp).unwrap();

let col_y = df.col("y").unwrap();
assert_eq!(col_y.sexp, df.list.get_value(1).unwrap().sexp);

assert_match!(df.col("foo"), harp::Result::Err(err) => {
assert_match!(err, harp::Error::MissingColumnError { ref name } => {
assert_eq!(name, "foo");
});
assert!(format!("{err}").contains("Can't find column `foo` in data frame"))
});
})
}
}
30 changes: 29 additions & 1 deletion crates/harp/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ pub enum Error {
UnsafeEvaluationError(String),
UnexpectedLength(usize, usize),
UnexpectedType(u32, Vec<u32>),
UnexpectedClass(Option<Vec<String>>, String),
ValueOutOfRange {
value: i64,
min: i64,
Expand All @@ -48,6 +49,9 @@ pub enum Error {
line: i32,
},
MissingValueError,
MissingColumnError {
name: String,
},
MissingBindingError {
name: String,
},
Expand Down Expand Up @@ -172,6 +176,18 @@ impl fmt::Display for Error {
)
},

Error::UnexpectedClass(actual, expected) => {
let actual = if let Some(actual) = actual {
actual.join("/")
} else {
String::from("_unclassed_")
};
write!(
f,
"Unexpected class for R object (expected {expected}; got {actual})",
)
},

Error::ValueOutOfRange { value, min, max } => {
write!(
f,
Expand Down Expand Up @@ -204,8 +220,12 @@ impl fmt::Display for Error {
write!(f, "{err:?}")
},

Error::MissingColumnError { name } => {
write!(f, "Can't find column `{name}` in data frame")
},

Error::MissingBindingError { name } => {
write!(f, "Can't find binding {name} in environment")
write!(f, "Can't find binding `{name}` in environment")
},

Error::OutOfMemory { size } => {
Expand All @@ -226,6 +246,14 @@ macro_rules! anyhow {
}}
}

#[macro_export]
macro_rules! unreachable {
($($rest: expr),*) => {{
let message = format!($($rest, )*);
harp::anyhow!("Internal error: {message}")
}}
}

pub fn as_result<T, E>(res: std::result::Result<T, E>) -> crate::Result<T>
where
E: std::fmt::Debug,
Expand Down
2 changes: 2 additions & 0 deletions crates/harp/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
//
pub mod attrib;
pub mod call;
pub mod data_frame;
pub mod environment;
pub mod environment_iter;
pub mod error;
Expand Down Expand Up @@ -39,6 +40,7 @@ pub mod vec_format;
pub mod vector;

// Reexport API
pub use data_frame::*;
pub use eval::*;
pub use object::*;
pub use parse::*;
Expand Down
Loading

0 comments on commit b7985c6

Please sign in to comment.