Skip to content

Commit

Permalink
adapt to recent changes on main
Browse files Browse the repository at this point in the history
Signed-off-by: Luka Peschke <[email protected]>
  • Loading branch information
lukapeschke committed Feb 27, 2024
1 parent 1cde690 commit d6548a4
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 12 deletions.
5 changes: 5 additions & 0 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,12 @@ def load_sheet_eager(
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
) -> pa.RecordBatch:
"""Loads a sheet by name if a string is passed or by index if an integer is passed.
The sheet is loaded eagerly.
See `load_sheet_by_idx` and `load_sheet_by_name` for parameter documentation.
"""
return (
Expand All @@ -248,6 +251,7 @@ def load_sheet_eager(
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
)
if isinstance(idx_or_name, int)
else self._reader.load_sheet_by_name_eager(
Expand All @@ -257,6 +261,7 @@ def load_sheet_eager(
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
)
)

Expand Down
12 changes: 12 additions & 0 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class _ExcelReader:
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
) -> pa.RecordBatch: ...
def load_sheet_by_idx(
self,
Expand All @@ -62,6 +63,17 @@ class _ExcelReader:
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
) -> _ExcelSheet: ...
# Type stub for the Rust-implemented eager loader that selects a sheet by its index.
# Unlike `load_sheet_by_idx`, which is typed as returning an `_ExcelSheet`, this
# variant is typed as returning a `pa.RecordBatch` directly.
# All options after `idx` are keyword-only. `use_columns` may be a list of column
# names, a list of column indices, or None.
def load_sheet_by_idx_eager(
self,
idx: int,
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | None = None,
) -> pa.RecordBatch: ...
# Read-only property typed as a list of strings.
# NOTE(review): presumably the names of the sheets in the opened workbook; confirm
# against the Rust implementation, which is not visible in this diff.
@property
def sheet_names(self) -> list[str]: ...

Expand Down
45 changes: 35 additions & 10 deletions src/types/excelreader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use crate::types::excelsheet::sheet_column_names_from_header_and_range;
use crate::utils::arrow::arrow_schema_from_column_names_and_range;
use crate::utils::schema::get_schema_sample_rows;

use super::excelsheet::record_batch_from_data_and_schema;
use super::excelsheet::{record_batch_from_data_and_schema, SelectedColumns};
use super::{
excelsheet::{Header, Pagination},
ExcelSheet,
Expand All @@ -28,6 +28,10 @@ pub(crate) struct ExcelReader {
}

impl ExcelReader {
/// Converts the optional Python `use_columns` list into a [`SelectedColumns`].
///
/// # Errors
/// If the conversion fails, the error is annotated with a message listing the accepted
/// shapes (`list[str] | list[int] | None`) and the debug representation of the value that
/// was received, and is then turned into a `PyResult` error via `into_pyresult`.
fn build_selected_columns(use_columns: Option<&PyList>) -> PyResult<SelectedColumns> {
    // Built lazily: the message is only formatted when the conversion actually fails.
    let describe_failure = || {
        format!(
            "expected selected columns to be list[str] | list[int] | None, got {use_columns:?}"
        )
    };

    use_columns
        .try_into()
        .with_context(describe_failure)
        .into_pyresult()
}

// NOTE: Not implementing TryFrom here, because we aren't building the reader from the passed
// string itself, but rather from the file it points to. Semantically, try_from_path is clearer
pub(crate) fn try_from_path(path: &str) -> FastExcelResult<Self> {
Expand All @@ -47,6 +51,7 @@ impl ExcelReader {
pagination: Pagination,
header: Header,
sample_rows: Option<usize>,
selected_columns: &SelectedColumns,
) -> FastExcelResult<RecordBatch> {
let column_names = sheet_column_names_from_header_and_range(&header, &data);

Expand All @@ -68,6 +73,7 @@ impl ExcelReader {
&column_names,
offset,
schema_sample_rows,
selected_columns,
)
.with_context(|| "could not build arrow schema")?;

Expand Down Expand Up @@ -111,7 +117,7 @@ impl ExcelReader {

let header = Header::new(header_row, column_names);
let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?;
let selected_columns = Self::build_selected_columns(use_columns)?;
ExcelSheet::try_new(
name,
range,
Expand All @@ -131,6 +137,7 @@ impl ExcelReader {
skip_rows = 0,
n_rows = None,
schema_sample_rows = 1_000,
use_columns = None,
))]
#[allow(clippy::too_many_arguments)]
pub fn load_sheet_by_name_eager(
Expand All @@ -141,6 +148,7 @@ impl ExcelReader {
skip_rows: usize,
n_rows: Option<usize>,
schema_sample_rows: Option<usize>,
use_columns: Option<&PyList>,
py: Python<'_>,
) -> PyResult<PyObject> {
let range = self
Expand All @@ -152,9 +160,16 @@ impl ExcelReader {

let header = Header::new(header_row, column_names);
let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
let rb = ExcelReader::load_sheet_eager(range, pagination, header, schema_sample_rows)
.with_context(|| "could not load sheet eagerly")
.into_pyresult()?;
let selected_columns = Self::build_selected_columns(use_columns)?;
let rb = ExcelReader::load_sheet_eager(
range,
pagination,
header,
schema_sample_rows,
&selected_columns,
)
.with_context(|| "could not load sheet eagerly")
.into_pyresult()?;
rb.to_pyarrow(py)
}

Expand Down Expand Up @@ -205,7 +220,8 @@ impl ExcelReader {

let header = Header::new(header_row, column_names);
let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
let selected_columns = use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | None, got {use_columns:?}")).into_pyresult()?;
let selected_columns = Self::build_selected_columns(use_columns)?;

ExcelSheet::try_new(
name,
range,
Expand All @@ -225,6 +241,7 @@ impl ExcelReader {
skip_rows = 0,
n_rows = None,
schema_sample_rows = 1_000,
use_columns = None,
))]
#[allow(clippy::too_many_arguments)]
pub fn load_sheet_by_idx_eager(
Expand All @@ -235,23 +252,31 @@ impl ExcelReader {
skip_rows: usize,
n_rows: Option<usize>,
schema_sample_rows: Option<usize>,
use_columns: Option<&PyList>,
py: Python<'_>,
) -> PyResult<PyObject> {
let range = self
.sheets
.worksheet_range_at(idx)
// Returns Option<Result<Range<Data>, Self::Error>>, so we convert the Option into a
// SheetNotFoundError and unwrap it
.ok_or_else(|| FastExcelErrorKind::SheetNotFound(SheetIdxOrName::Idx(idx)).into())
.ok_or_else(|| FastExcelErrorKind::SheetNotFound(IdxOrName::Idx(idx)).into())
.into_pyresult()?
// And here, we convert the calamine error into an owned error and unwrap it
.map_err(|err| FastExcelErrorKind::CalamineError(err).into())
.into_pyresult()?;
let header = Header::new(header_row, column_names);
let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?;
let rb = ExcelReader::load_sheet_eager(range, pagination, header, schema_sample_rows)
.with_context(|| "could not load sheet eagerly")
.into_pyresult()?;
let selected_columns = Self::build_selected_columns(use_columns)?;
let rb = ExcelReader::load_sheet_eager(
range,
pagination,
header,
schema_sample_rows,
&selected_columns,
)
.with_context(|| "could not load sheet eagerly")
.into_pyresult()?;
rb.to_pyarrow(py)
}
}
3 changes: 1 addition & 2 deletions src/types/excelsheet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,8 +416,7 @@ impl TryFrom<&ExcelSheet> for Schema {
sheet.data(),
&sheet.available_columns,
sheet.offset(),
// If sample_rows is higher than the sheet's limit, use the limit instead
std::cmp::min(sample_rows, sheet.limit()),
sheet.schema_sample_rows(),
&sheet.selected_columns,
)
}
Expand Down

0 comments on commit d6548a4

Please sign in to comment.