Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle local data. #420

Merged
merged 10 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions editor/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ RUN apt-get update && apt-get install -y \
software-properties-common \
git \
python3-pip \
libmagic1 \
&& rm -rf /var/lib/apt/lists/*

COPY ./ /app/
Expand Down
2 changes: 2 additions & 0 deletions editor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ npm run cypress:run # Runs e2e tests in background

You can debug the tests in Github Actions because failed screenshots are uploaded as artifacts.

You may need to install the `libmagic` system library, which is required by [`python-magic`](https://pypi.org/project/python-magic).

# Create a custom component

Custom components are in `components/`.
Expand Down
7 changes: 7 additions & 0 deletions editor/core/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,10 @@ def str_to_mlc_data_type(data_type: str) -> mlc.DataType | None:
if data_type == str_data_type:
return mlc_data_type
return None


def mlc_to_str_data_type(data_type: mlc.DataType) -> str | None:
    """Maps an mlc data type back to its string representation.

    Inverse of `str_to_mlc_data_type`. Returns None when the data type is not
    one of the known MLC_DATA_TYPES.
    """
    # The two lists are parallel: STR_DATA_TYPES[i] names MLC_DATA_TYPES[i].
    for str_data_type, mlc_data_type in zip(STR_DATA_TYPES, MLC_DATA_TYPES):
        if data_type == mlc_data_type:
            return str_data_type
    return None
111 changes: 96 additions & 15 deletions editor/core/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import tempfile

from etils import epath
import magic
import pandas as pd
import requests

from .names import find_unique_name
from .path import get_resource_path
from .state import FileObject
from .state import FileSet

Expand All @@ -30,6 +32,8 @@ class FileTypes:
encoding_format="application/vnd.ms-excel",
extensions=["xls", "xlsx", "xlsm"],
)
GZIP = FileType(name="GZIP", encoding_format="application/gzip", extensions=["gz"])
JPEG = FileType(name="JPEG", encoding_format="image/jpeg", extensions=["json"])
JSON = FileType(
name="JSON", encoding_format="application/json", extensions=["json"]
)
Expand All @@ -43,19 +47,63 @@ class FileTypes:
encoding_format="application/vnd.apache.parquet",
extensions=["parquet"],
)
TAR = FileType(
name="Archive (TAR)",
encoding_format="application/x-tar",
extensions=["tar"],
)
TXT = FileType(
name="Text",
encoding_format="plain/text",
extensions=["txt"],
)
ZIP = FileType(
name="ZIP",
encoding_format="application/zip",
extensions=["zip"],
)


def _full_name(file_type: FileType):
return f"{file_type.name} ({file_type.encoding_format})"


# Maps a file type's display label (e.g. "CSV (text/csv)") to its FileType.
# Keys are built with `_full_name`, so each label embeds the encoding format.
FILE_TYPES: dict[str, FileType] = {
    _full_name(file_type): file_type
    for file_type in [
        FileTypes.CSV,
        FileTypes.EXCEL,
        FileTypes.GZIP,
        FileTypes.JPEG,
        FileTypes.JSON,
        FileTypes.JSONL,
        FileTypes.PARQUET,
        FileTypes.TAR,
        FileTypes.TXT,
        FileTypes.ZIP,
    ]
}

# Reverse lookup: encoding format (MIME string) -> FileType.
ENCODING_FORMATS: dict[str, FileType] = {
    file_type.encoding_format: file_type for file_type in FILE_TYPES.values()
}


def name_to_code(file_type_name: str) -> str | None:
"""Maps names to the encoding format: Text => plain/text."""
for name, file_type in FILE_TYPES.items():
if file_type_name == name:
return file_type.encoding_format
return None


def code_to_index(encoding_format: str) -> int | None:
"""Maps the encoding format to its index in the list of keys: plain/text => 12."""
for i, file_type in enumerate(FILE_TYPES.values()):
if file_type.encoding_format == encoding_format:
return i
return None


def _sha256(content: bytes):
"""Computes the sha256 digest of the byte string."""
Expand Down Expand Up @@ -84,58 +132,91 @@ def download_file(url: str, file_path: epath.Path):
def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
"""Gets the df associated to the file."""
if file_type == FileTypes.CSV:
return pd.read_csv(file)
df = pd.read_csv(file)
elif file_type == FileTypes.EXCEL:
return pd.read_excel(file)
df = pd.read_excel(file)
elif file_type == FileTypes.JSON:
return pd.read_json(file)
df = pd.read_json(file)
elif file_type == FileTypes.JSONL:
return pd.read_json(file, lines=True)
df = pd.read_json(file, lines=True)
elif file_type == FileTypes.PARQUET:
return pd.read_parquet(file)
df = pd.read_parquet(file)
else:
raise NotImplementedError()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a message, something like "File format XXX not supported"? That's less cryptic, and people know what to ask for.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in another PR (1d2ccf9).

return df.infer_objects()


def guess_file_type(path: epath.Path) -> FileType | None:
    """Guesses a FileType by sniffing the file's MIME type with libmagic.

    Returns None when the detected MIME is not a known encoding format.
    NOTE(review): `magic.from_file` may expect a plain string path — confirm
    that an `epath.Path` is accepted here.
    """
    detected_mime = magic.from_file(path, mime=True)
    return ENCODING_FORMATS.get(detected_mime)

def file_from_url(file_type: FileType, url: str, names: set[str]) -> FileObject:

def file_from_url(url: str, names: set[str], folder: epath.Path) -> FileObject:
"""Downloads locally and extracts the file information."""
file_path = hash_file_path(url)
if not file_path.exists():
download_file(url, file_path)
with file_path.open("rb") as file:
sha256 = _sha256(file.read())
df = get_dataframe(file_type, file_path).infer_objects()
file_type = guess_file_type(file_path)
df = get_dataframe(file_type, file_path)
return FileObject(
name=find_unique_name(names, url.split("/")[-1]),
description="",
content_url=url,
encoding_format=file_type.encoding_format,
sha256=sha256,
df=df,
folder=folder,
)


def file_from_upload(
file_type: FileType, file: io.BytesIO, names: set[str]
file: io.BytesIO, names: set[str], folder: epath.Path
) -> FileObject:
"""Uploads locally and extracts the file information."""
sha256 = _sha256(file.getvalue())
df = get_dataframe(file_type, file).infer_objects()
value = file.getvalue()
content_url = f"data/{file.name}"
sha256 = _sha256(value)
file_path = get_resource_path(content_url)
with file_path.open("wb") as f:
f.write(value)
file_type = guess_file_type(file_path)
df = get_dataframe(file_type, file)
return FileObject(
name=find_unique_name(names, file.name),
description="",
content_url=f"data/{file.name}",
content_url=content_url,
encoding_format=file_type.encoding_format,
sha256=sha256,
df=df,
folder=folder,
)


def file_from_form(type: str, names: set[str]) -> FileObject | FileSet:
def file_from_form(
type: str, names: set[str], folder: epath.Path
) -> FileObject | FileSet:
"""Creates a file based on manually added fields."""
if type == FILE_OBJECT:
return FileObject(name=find_unique_name(names, "file_object"))
return FileObject(name=find_unique_name(names, "file_object"), folder=folder)
elif type == FILE_SET:
return FileSet(name=find_unique_name(names, "file_set"))
return FileSet(name=find_unique_name(names, "file_set"), folder=folder)
else:
raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")


def is_url(file: FileObject) -> bool:
    """Returns True when the file's content_url points to a remote URL.

    Wrapped in `bool()` so the function matches its annotation: without it,
    a None or empty `content_url` leaks out as None/"" instead of False.
    """
    return bool(file.content_url and file.content_url.startswith("http"))


def trigger_download(file: FileObject):
    """Ensures the file's bytes are on disk, then loads them into `file.df`.

    Remote files (content_url starting with "http") are downloaded once to
    their hashed cache path; local files are resolved through the project's
    resource folder.
    """
    if is_url(file):
        path = hash_file_path(file.content_url)
        if not path.exists():
            download_file(file.content_url, path)
    else:
        path = get_resource_path(file.content_url)
    file.df = get_dataframe(guess_file_type(path), path)
21 changes: 15 additions & 6 deletions editor/core/files_test.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
from unittest import mock

from etils import epath
import pandas as pd
import pytest

from .files import file_from_url
from .files import FileTypes
from core import files as files_module

FileTypes = files_module.FileTypes


def test_check_file_csv():
@mock.patch.object(files_module, "guess_file_type", return_value=FileTypes.CSV)
def test_check_file_csv(guess_file_type):
del guess_file_type
csv = epath.Path(
# This is the hash path for "https://my.url".
"/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
Expand All @@ -18,10 +23,14 @@ def test_check_file_csv():
f.write("a,1\n")
f.write("b,2\n")
f.write("c,3\n")
file = file_from_url(FileTypes.CSV, "https://my.url", set())
file = files_module.file_from_url("https://my.url", set(), epath.Path())
pd.testing.assert_frame_equal(
file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
)
# Fails with unknown encoding_format:


@mock.patch.object(files_module, "guess_file_type", return_value="unknown")
def test_check_file_unknown(guess_file_type):
del guess_file_type
with pytest.raises(NotImplementedError):
file_from_url("unknown", "https://my.url", set())
files_module.file_from_url("https://my.url", set(), epath.Path())
7 changes: 6 additions & 1 deletion editor/core/past_projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from core.constants import PAST_PROJECTS_PATH
from core.query_params import set_project
from core.state import CurrentProject
from core.state import FileObject
from core.state import get_cached_user
from core.state import Metadata

Expand All @@ -23,13 +24,17 @@ def _pickle_file(path: epath.Path) -> epath.Path:


def save_current_project():
metadata = st.session_state[Metadata]
metadata: Metadata = st.session_state[Metadata]
project = st.session_state.get(CurrentProject)
if not project:
project = CurrentProject.create_new()
st.session_state[CurrentProject] = project
project.path.mkdir(parents=True, exist_ok=True)
set_project(project)
# FileObjects should have a folder.
for resource in metadata.distribution:
if isinstance(resource, FileObject):
resource.folder = project.path
try:
pickled = pickle.dumps(metadata)
_pickle_file(project.path).write_bytes(pickled)
Expand Down
13 changes: 13 additions & 0 deletions editor/core/path.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from etils import epath
import streamlit as st

from core.state import CurrentProject


def get_resource_path(content_url: str) -> epath.Path:
    """Gets the path on disk of the resource with `content_url`.

    The parent directory is created if needed so callers can write to the
    returned path directly.
    """
    project: CurrentProject = st.session_state[CurrentProject]
    path = project.path / content_url
    # `exist_ok=True` already tolerates an existing directory; a prior
    # `exists()` check is redundant and race-prone (TOCTOU).
    path.parent.mkdir(parents=True, exist_ok=True)
    return path
1 change: 1 addition & 0 deletions editor/core/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ class FileObject:
sha256: str | None = None
df: pd.DataFrame | None = None
rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
folder: epath.PathLike | None = None


@dataclasses.dataclass
Expand Down
2 changes: 1 addition & 1 deletion editor/cypress/e2e/renameDistribution.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ describe('Renaming of FileObjects/FileSets/RecordSets/Fields.', () => {
getBody().contains('Record Sets').click()
})
cy.contains('genders').click()
cy.contains('Edit fields details').click()
cy.contains('Edit fields details').click({force: true})
cy.contains('the-new-name')
})
})
4 changes: 2 additions & 2 deletions editor/cypress/e2e/uploadCsv.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ describe('Editor loads a local CSV as a resource', () => {
cy.contains('base.csv_record_set (2 fields)').click()
// We also see the fields with the proper types.
cy.get('[data-testid="stDataFrameResizable"]').contains("column1")
cy.get('[data-testid="stDataFrameResizable"]').contains("https://schema.org/Text")
cy.get('[data-testid="stDataFrameResizable"]').contains("Text")
cy.get('[data-testid="stDataFrameResizable"]').contains("column2")
cy.get('[data-testid="stDataFrameResizable"]').contains("https://schema.org/Integer")
cy.get('[data-testid="stDataFrameResizable"]').contains("Integer")

// I can edit the details of the fields.
cy.contains('Generating the dataset...').should('not.exist')
Expand Down
9 changes: 8 additions & 1 deletion editor/events/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import streamlit as st

from core.files import FILE_OBJECT
from core.files import name_to_code
from core.path import get_resource_path
from core.state import FileObject
from core.state import FileSet
from core.state import Metadata
Expand Down Expand Up @@ -37,7 +39,7 @@ def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
elif event == ResourceEvent.DESCRIPTION:
resource.description = value
elif event == ResourceEvent.ENCODING_FORMAT:
resource.encoding_format = value
resource.encoding_format = name_to_code(value)
elif event == ResourceEvent.INCLUDES:
resource.includes = value
elif event == ResourceEvent.SHA256:
Expand All @@ -47,6 +49,11 @@ def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
elif event == ResourceEvent.CONTENT_SIZE:
resource.content_size = value
elif event == ResourceEvent.CONTENT_URL:
if resource.content_url and value:
old_path = get_resource_path(resource.content_url)
new_path = get_resource_path(value)
if old_path.exists() and not new_path.exists():
old_path.rename(new_path)
resource.content_url = value
elif event == ResourceEvent.TYPE:
metadata: Metadata = st.session_state[Metadata]
Expand Down
1 change: 1 addition & 0 deletions editor/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ mlcroissant
numpy
pandas
pytest
python-magic
rdflib
requests
streamlit
Expand Down
Loading
Loading