Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Csvxlsx file reader #664

Open
wants to merge 13 commits into
base: develop
Choose a base branch
from
1 change: 1 addition & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ class MyReferenceStyle(AuthorYearReferenceStyle):
("py:obj", "virtual_ecosystem.core.grid.GRID_STRUCTURE_SIG.__repr__"),
("py:obj", "virtual_ecosystem.core.grid.GRID_STRUCTURE_SIG.count"),
("py:obj", "virtual_ecosystem.core.grid.GRID_STRUCTURE_SIG.index"),
("py:exc", "ParserError"),
]
intersphinx_mapping = {
"numpy": ("https://numpy.org/doc/stable/", None),
Expand Down
47 changes: 36 additions & 11 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ tomli = {version = "^2.0.1", python = "<3.11"}
tomli-w = "^1.0.0"
tqdm = "^4.66.2"
xarray = ">=2024.6,<2026.0"
openpyxl = "^3.1.5"

[tool.poetry.group.types.dependencies]
types-dataclasses = "^0.6.6"
Expand Down
2 changes: 2 additions & 0 deletions tests/core/data/garbage.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
a,b,c
1,2,"3
Binary file added tests/core/data/garbage.xlsx
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/core/data/reader_test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
var1,var2
1,4
2,5
3,6
Binary file added tests/core/data/reader_test.xlsx
Binary file not shown.
71 changes: 70 additions & 1 deletion tests/core/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from logging import CRITICAL, DEBUG, INFO

import pytest
from pandas.errors import ParserError
from xarray import DataArray

from tests.conftest import log_check
Expand Down Expand Up @@ -93,7 +94,11 @@ def test_func():
],
)
def test_load_netcdf(shared_datadir, caplog, file, file_var, exp_err, expected_log):
"""Test the netdcf variable loader."""
"""Test the netdcf variable loader.

The tests here are dependent on the test_file_format_loader, so cannot be run
individually.
"""

from virtual_ecosystem.core.readers import load_netcdf

Expand All @@ -105,6 +110,70 @@ def test_load_netcdf(shared_datadir, caplog, file, file_var, exp_err, expected_l
log_check(caplog, expected_log)


@pytest.mark.parametrize(
argnames=["file", "file_var", "exp_err", "expected_log"],
argvalues=[
(
"not_there.csv",
"irrelevant",
pytest.raises(FileNotFoundError),
((CRITICAL, "Data file not found"),),
),
(
"garbage.csv",
"irrelevant",
pytest.raises(ParserError),
((CRITICAL, "Could not load data from"),),
),
(
"reader_test.csv",
"missing",
pytest.raises(KeyError),
((CRITICAL, "Variable missing not found in"),),
),
(
"reader_test.csv",
"var1",
does_not_raise(),
(),
),
(
"garbage.xlsx",
"irrelevant",
pytest.raises(Exception),
((CRITICAL, "Unidentified exception opening"),),
),
(
"reader_test.xlsx",
"missing",
pytest.raises(KeyError),
((CRITICAL, "Variable missing not found in"),),
),
(
"reader_test.xlsx",
"var1",
does_not_raise(),
(),
),
],
)
def test_load_dataframe(shared_datadir, caplog, file, file_var, exp_err, expected_log):
"""Test the netdcf variable loader.

The tests here are dependent on the test_file_format_loader, so cannot be run
individually.
"""

from virtual_ecosystem.core.readers import load_from_dataframe

with exp_err:
darray = load_from_dataframe(shared_datadir / file, file_var)
assert isinstance(darray, DataArray)

# Check the error reports
log_check(caplog, expected_log)


@pytest.mark.parametrize(
argnames=[
"filename",
Expand Down
57 changes: 56 additions & 1 deletion virtual_ecosystem/core/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ def new_function_to_load_tif_data(...):
from collections.abc import Callable
from pathlib import Path

from pandas import read_csv, read_excel
from pandas.errors import ParserError
from xarray import DataArray, load_dataset

from virtual_ecosystem.core.logger import LOGGER
Expand All @@ -55,7 +57,7 @@ def new_function_to_load_tif_data(...):
"""


def register_file_format_loader(file_types: tuple[str]) -> Callable:
def register_file_format_loader(file_types: tuple[str, ...]) -> Callable:
"""Adds a data loader function to the data loader registry.

This decorator is used to register a function that loads data from a given file type
Expand Down Expand Up @@ -134,6 +136,59 @@ def load_netcdf(file: Path, var_name: str) -> DataArray:
return dataset[var_name]


@register_file_format_loader(
file_types=(
".csv",
".xlsx",
)
)
def load_from_dataframe(file: Path, var_name: str) -> DataArray:
"""Loads a DataArray from a data frame format.

Args:
file: A Path for a csv or excel file containing the variable to load.
var_name: A string providing the name of the variables in this file.

Raises:
FileNotFoundError: with bad file path names.
ParserError: if the csv data is not readable.
Exception: if the excel data is not readable.

Note: the general exception is used because of the variety of exceptions that are
possible with read_excel.
"""

to_raise: Exception
file_type = file.suffix

# Determine dataframe file type & load file
try:
if file_type == ".csv":
dataset = read_csv(file)
else:
dataset = read_excel(file, engine="openpyxl")
except FileNotFoundError:
to_raise = FileNotFoundError(f"Data file not found: {file}")
LOGGER.critical(to_raise)
raise to_raise
except ParserError as err:
to_raise = ParserError(f"Could not load data from {file}: {err}.")
LOGGER.critical(to_raise)
raise to_raise
except Exception as err:
to_raise = Exception(f"Unidentified exception opening {file}: {err}")
LOGGER.critical(to_raise)
raise to_raise
Comment on lines +208 to +211
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's worth explicitly adding BadZipError - it's by far the most likely thing to happen and we can effectively report it as "Does not appear to be an Excel file". The general Exception can then handle anything else. I'd add a comment on that exception to say that openpyxl has a whole bunch of assorted failure modes for corrupt/non-excel Zip files and we're not going to try and track down an exhaustive list.

You can then add an actual zip file of (e.g.) a simple text file to test that failure mode.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do!!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a corrupted excel file which triggers that error - I think that should cover that failure case?


# Check if file var is in the dataset
if var_name not in dataset.columns:
to_raise = KeyError(f"Variable {var_name} not found in {file}")
LOGGER.critical(to_raise)
raise to_raise

return dataset[var_name].to_xarray()


def load_to_dataarray(
file: Path,
var_name: str,
Expand Down
Loading