Skip to content

Commit

Permalink
feat: read excel from bytes content (#192)
Browse files Browse the repository at this point in the history
* test: rename

* chore: ignore vscode folder

* feat: read from bytes

* fix: not path in rust

* refactor: switch to owned type

* perf: way better
  • Loading branch information
PrettyWood authored Feb 28, 2024
1 parent fb3a582 commit c5ab088
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 21 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ __pycache__
.python-version
.venv
docs
.vscode
.idea
.benchmarks
11 changes: 6 additions & 5 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from pathlib import Path

import pandas as pd
import polars as pl

from os.path import expanduser
from pathlib import Path

import pyarrow as pa

Expand Down Expand Up @@ -238,12 +237,14 @@ def __repr__(self) -> str:
return self._reader.__repr__()


def read_excel(path: Path | str) -> ExcelReader:
def read_excel(source: Path | str | bytes) -> ExcelReader:
"""Opens and loads an excel file.
:param path: The path to the file
:param source: The path to a file or its content as bytes
"""
return ExcelReader(_read_excel(expanduser(path)))
if isinstance(source, (str, Path)):
source = expanduser(source)
return ExcelReader(_read_excel(source))


__all__ = (
Expand Down
2 changes: 1 addition & 1 deletion python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class _ExcelReader:
@property
def sheet_names(self) -> list[str]: ...

def read_excel(path: str) -> _ExcelReader:
def read_excel(source: str | bytes) -> _ExcelReader:
"""Reads an excel file and returns an ExcelReader"""

__version__: str
Expand Down
6 changes: 6 additions & 0 deletions python/tests/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
from utils import path_for_fixture


def test_read_excel_bad_type() -> None:
    """An unsupported source type (here an int) raises InvalidParametersError."""
    not_a_valid_source = 42
    with pytest.raises(
        fastexcel.InvalidParametersError,
        match="source must be a string or bytes",
    ):
        fastexcel.read_excel(not_a_valid_source)  # type: ignore[arg-type]


def test_does_not_exist() -> None:
expected_message = """calamine error: Cannot detect file format
Context:
Expand Down
29 changes: 26 additions & 3 deletions python/tests/test_fastexcel.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from utils import path_for_fixture


def test_single_sheet_to_pandas():
def test_single_sheet():
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
assert excel_reader.sheet_names == ["January"]
sheet_by_name = excel_reader.load_sheet("January")
Expand All @@ -31,7 +31,30 @@ def test_single_sheet_to_pandas():
pl_assert_frame_equal(sheet_by_idx.to_polars(), pl_expected)


def test_single_sheet_with_types_to_pandas():
def test_single_sheet_bytes():
    """Reading the single-sheet fixture from its raw bytes yields the expected sheet."""
    with open(path_for_fixture("fixture-single-sheet.xlsx"), "rb") as fixture_file:
        content = fixture_file.read()
    reader = fastexcel.read_excel(content)
    assert reader.sheet_names == ["January"]

    # The same sheet, fetched once by name and once by position
    by_name = reader.load_sheet("January")
    by_idx = reader.load_sheet(0)

    # Metadata
    assert by_name.name == by_idx.name == "January"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 2

    expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    # Both access paths must produce the same pandas frame...
    pd_expected = pd.DataFrame(expected)
    for sheet in (by_name, by_idx):
        pd_assert_frame_equal(sheet.to_pandas(), pd_expected)

    # ...and the same polars frame
    pl_expected = pl.DataFrame(expected)
    for sheet in (by_name, by_idx):
        pl_assert_frame_equal(sheet.to_polars(), pl_expected)


def test_single_sheet_with_types():
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
assert excel_reader.sheet_names == ["Sheet1"]

Expand Down Expand Up @@ -67,7 +90,7 @@ def test_single_sheet_with_types_to_pandas():
)


def test_multiple_sheets_to_pandas():
def test_multiple_sheets():
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))
assert excel_reader.sheet_names == ["January", "February", "With unnamed columns"]

Expand Down
18 changes: 14 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,22 @@ use types::{ExcelReader, ExcelSheet};

/// Reads an excel file and returns an object allowing to access its sheets and a bit of metadata
#[pyfunction]
fn read_excel(path: &str) -> PyResult<ExcelReader> {
fn read_excel(source: &PyAny) -> PyResult<ExcelReader> {
use py_errors::IntoPyResult;

ExcelReader::try_from_path(path)
.with_context(|| format!("could not load excel file at {path}"))
.into_pyresult()
if let Ok(path) = source.extract::<&str>() {
ExcelReader::try_from_path(path)
.with_context(|| format!("could not load excel file at {path}"))
.into_pyresult()
} else if let Ok(bytes) = source.extract::<&[u8]>() {
ExcelReader::try_from(bytes)
.with_context(|| "could not load excel file for those bytes")
.into_pyresult()
} else {
Err(py_errors::InvalidParametersError::new_err(
"source must be a string or bytes",
))
}
}

// Taken from pydantic-core:
Expand Down
68 changes: 60 additions & 8 deletions src/types/excelreader.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,58 @@
use std::{fs::File, io::BufReader};
use std::{
fs::File,
io::{BufReader, Cursor},
};

use calamine::{open_workbook_auto, Reader, Sheets};
use calamine::{
open_workbook_auto, open_workbook_auto_from_rs, Data, Error, Range, Reader, Sheets,
};
use pyo3::{pyclass, pymethods, PyAny, PyResult};

use crate::error::{
py_errors::IntoPyResult, ErrorContext, FastExcelErrorKind, FastExcelResult, IdxOrName,
py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult,
IdxOrName,
};

use super::{
excelsheet::{Header, Pagination},
ExcelSheet,
};

/// The opened workbook, wrapping calamine's `Sheets` over one of the two supported readers.
enum ExcelSheets {
/// Workbook read through a buffered file handle.
File(Sheets<BufReader<File>>),
/// Workbook read from an in-memory cursor over an owned byte buffer.
Bytes(Sheets<Cursor<Vec<u8>>>),
}

impl ExcelSheets {
fn worksheet_range(&mut self, name: &str) -> Result<Range<Data>, Error> {
match self {
Self::File(sheets) => sheets.worksheet_range(name),
Self::Bytes(sheets) => sheets.worksheet_range(name),
}
}

fn worksheet_range_at(&mut self, idx: usize) -> Option<Result<Range<Data>, Error>> {
match self {
Self::File(sheets) => sheets.worksheet_range_at(idx),
Self::Bytes(sheets) => sheets.worksheet_range_at(idx),
}
}

#[allow(dead_code)]
fn sheet_names(&self) -> Vec<String> {
match self {
Self::File(sheets) => sheets.sheet_names(),
Self::Bytes(sheets) => sheets.sheet_names(),
}
}
}

#[pyclass(name = "_ExcelReader")]
pub(crate) struct ExcelReader {
sheets: Sheets<BufReader<File>>,
sheets: ExcelSheets,
#[pyo3(get)]
sheet_names: Vec<String>,
path: String,
source: String,
}

impl ExcelReader {
Expand All @@ -29,17 +64,34 @@ impl ExcelReader {
.with_context(|| format!("Could not open workbook at {path}"))?;
let sheet_names = sheets.sheet_names().to_owned();
Ok(Self {
sheets,
sheets: ExcelSheets::File(sheets),
sheet_names,
source: path.to_owned(),
})
}
}

impl TryFrom<&[u8]> for ExcelReader {
type Error = FastExcelError;

fn try_from(bytes: &[u8]) -> Result<Self, Self::Error> {
let cursor = Cursor::new(bytes.to_vec());
let sheets = open_workbook_auto_from_rs(cursor)
.map_err(|err| FastExcelErrorKind::CalamineError(err).into())
.with_context(|| "Could not open workbook from bytes")?;
let sheet_names = sheets.sheet_names().to_owned();
Ok(Self {
sheets: ExcelSheets::Bytes(sheets),
sheet_names,
path: path.to_owned(),
source: "bytes".to_owned(),
})
}
}

#[pymethods]
impl ExcelReader {
pub fn __repr__(&self) -> String {
format!("ExcelReader<{}>", &self.path)
format!("ExcelReader<{}>", &self.source)
}

#[pyo3(signature = (
Expand Down

0 comments on commit c5ab088

Please sign in to comment.