Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle local data. #420

Merged
merged 10 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions editor/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ RUN apt-get update && apt-get install -y \
software-properties-common \
git \
python3-pip \
libmagic1 \
&& rm -rf /var/lib/apt/lists/*

COPY ./ /app/
Expand Down
2 changes: 2 additions & 0 deletions editor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ npm run cypress:run # Runs e2e tests in background

You can debug the tests in Github Actions because failed screenshots are uploaded as artifacts.

You may need to install the `libmagic` system library, which is required by [`python-magic`](https://pypi.org/project/python-magic).

# Create a custom component

Custom components are in `components/`.
Expand Down
7 changes: 7 additions & 0 deletions editor/core/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,10 @@ def str_to_mlc_data_type(data_type: str) -> mlc.DataType | None:
if data_type == str_data_type:
return mlc_data_type
return None


def mlc_to_str_data_type(data_type: mlc.DataType) -> str | None:
    """Maps an mlc data type back to its string representation.

    Inverse of `str_to_mlc_data_type`. Returns None when the data type is not
    one of the known MLC_DATA_TYPES.
    """
    # The two lists are parallel: STR_DATA_TYPES[i] names MLC_DATA_TYPES[i].
    for str_data_type, mlc_data_type in zip(STR_DATA_TYPES, MLC_DATA_TYPES):
        if data_type == mlc_data_type:
            return str_data_type
    return None
111 changes: 96 additions & 15 deletions editor/core/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import tempfile

from etils import epath
import magic
import pandas as pd
import requests

from .names import find_unique_name
from .path import get_resource_path
from .state import FileObject
from .state import FileSet

Expand All @@ -30,6 +32,8 @@ class FileTypes:
encoding_format="application/vnd.ms-excel",
extensions=["xls", "xlsx", "xlsm"],
)
GZIP = FileType(name="GZIP", encoding_format="application/gzip", extensions=["gz"])
JPEG = FileType(name="JPEG", encoding_format="image/jpeg", extensions=["json"])
JSON = FileType(
name="JSON", encoding_format="application/json", extensions=["json"]
)
Expand All @@ -43,19 +47,63 @@ class FileTypes:
encoding_format="application/vnd.apache.parquet",
extensions=["parquet"],
)
TAR = FileType(
name="Archive (TAR)",
encoding_format="application/x-tar",
extensions=["tar"],
)
TXT = FileType(
name="Text",
encoding_format="plain/text",
extensions=["txt"],
)
ZIP = FileType(
name="ZIP",
encoding_format="application/zip",
extensions=["zip"],
)


def _full_name(file_type: FileType):
return f"{file_type.name} ({file_type.encoding_format})"


# Maps a file type's display label (e.g. "CSV (text/csv)") to its FileType.
# Keys are built with `_full_name`, so each label embeds the encoding format.
FILE_TYPES: dict[str, FileType] = {
    _full_name(file_type): file_type
    for file_type in [
        FileTypes.CSV,
        FileTypes.EXCEL,
        FileTypes.GZIP,
        FileTypes.JPEG,
        FileTypes.JSON,
        FileTypes.JSONL,
        FileTypes.PARQUET,
        FileTypes.TAR,
        FileTypes.TXT,
        FileTypes.ZIP,
    ]
}

# Reverse lookup: encoding format (MIME string) -> FileType.
ENCODING_FORMATS: dict[str, FileType] = {
    file_type.encoding_format: file_type for file_type in FILE_TYPES.values()
}


def name_to_code(file_type_name: str) -> str | None:
"""Maps names to the encoding format: Text => plain/text."""
for name, file_type in FILE_TYPES.items():
if file_type_name == name:
return file_type.encoding_format
return None


def code_to_index(encoding_format: str) -> int | None:
"""Maps the encoding format to its index in the list of keys: plain/text => 12."""
for i, file_type in enumerate(FILE_TYPES.values()):
if file_type.encoding_format == encoding_format:
return i
return None


def _sha256(content: bytes):
"""Computes the sha256 digest of the byte string."""
Expand Down Expand Up @@ -84,58 +132,91 @@ def download_file(url: str, file_path: epath.Path):
def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
"""Gets the df associated to the file."""
if file_type == FileTypes.CSV:
return pd.read_csv(file)
df = pd.read_csv(file)
elif file_type == FileTypes.EXCEL:
return pd.read_excel(file)
df = pd.read_excel(file)
elif file_type == FileTypes.JSON:
return pd.read_json(file)
df = pd.read_json(file)
elif file_type == FileTypes.JSONL:
return pd.read_json(file, lines=True)
df = pd.read_json(file, lines=True)
elif file_type == FileTypes.PARQUET:
return pd.read_parquet(file)
df = pd.read_parquet(file)
else:
raise NotImplementedError()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a message, something like "File format XXX not supported"? That's less cryptic, and people know what to ask for.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in another PR (1d2ccf9).

return df.infer_objects()


def guess_file_type(path: epath.Path) -> FileType | None:
    """Guesses a FileType by sniffing the file's MIME type with libmagic.

    Returns None when the detected MIME is not a known encoding format.
    NOTE(review): `magic.from_file` may expect a plain string path — confirm
    that an `epath.Path` is accepted here.
    """
    detected_mime = magic.from_file(path, mime=True)
    return ENCODING_FORMATS.get(detected_mime)

def file_from_url(file_type: FileType, url: str, names: set[str]) -> FileObject:

def file_from_url(url: str, names: set[str], folder: epath.Path) -> FileObject:
"""Downloads locally and extracts the file information."""
file_path = hash_file_path(url)
if not file_path.exists():
download_file(url, file_path)
with file_path.open("rb") as file:
sha256 = _sha256(file.read())
df = get_dataframe(file_type, file_path).infer_objects()
file_type = guess_file_type(file_path)
df = get_dataframe(file_type, file_path)
return FileObject(
name=find_unique_name(names, url.split("/")[-1]),
description="",
content_url=url,
encoding_format=file_type.encoding_format,
sha256=sha256,
df=df,
folder=folder,
)


def file_from_upload(
file_type: FileType, file: io.BytesIO, names: set[str]
file: io.BytesIO, names: set[str], folder: epath.Path
) -> FileObject:
"""Uploads locally and extracts the file information."""
sha256 = _sha256(file.getvalue())
df = get_dataframe(file_type, file).infer_objects()
value = file.getvalue()
content_url = f"data/{file.name}"
sha256 = _sha256(value)
file_path = get_resource_path(content_url)
with file_path.open("wb") as f:
f.write(value)
file_type = guess_file_type(file_path)
df = get_dataframe(file_type, file)
return FileObject(
name=find_unique_name(names, file.name),
description="",
content_url=f"data/{file.name}",
content_url=content_url,
encoding_format=file_type.encoding_format,
sha256=sha256,
df=df,
folder=folder,
)


def file_from_form(type: str, names: set[str]) -> FileObject | FileSet:
def file_from_form(
type: str, names: set[str], folder: epath.Path
) -> FileObject | FileSet:
"""Creates a file based on manually added fields."""
if type == FILE_OBJECT:
return FileObject(name=find_unique_name(names, "file_object"))
return FileObject(name=find_unique_name(names, "file_object"), folder=folder)
elif type == FILE_SET:
return FileSet(name=find_unique_name(names, "file_set"))
return FileSet(name=find_unique_name(names, "file_set"), folder=folder)
else:
raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")


def is_url(file: FileObject) -> bool:
    """Returns True when the file's content_url points to a remote URL.

    Wrapped in `bool()` so the function matches its annotation: without it,
    a None or empty `content_url` leaks out as None/"" instead of False.
    """
    return bool(file.content_url and file.content_url.startswith("http"))


def trigger_download(file: FileObject):
    """Ensures the file's bytes are on disk, then loads them into `file.df`.

    Remote files (content_url starting with "http") are downloaded once to
    their hashed cache path; local files are resolved through the project's
    resource folder.
    """
    if is_url(file):
        path = hash_file_path(file.content_url)
        if not path.exists():
            download_file(file.content_url, path)
    else:
        path = get_resource_path(file.content_url)
    file.df = get_dataframe(guess_file_type(path), path)
21 changes: 15 additions & 6 deletions editor/core/files_test.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
from unittest import mock

from etils import epath
import pandas as pd
import pytest

from .files import file_from_url
from .files import FileTypes
from core import files as files_module

FileTypes = files_module.FileTypes


def test_check_file_csv():
@mock.patch.object(files_module, "guess_file_type", return_value=FileTypes.CSV)
def test_check_file_csv(guess_file_type):
del guess_file_type
csv = epath.Path(
# This is the hash path for "https://my.url".
"/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
Expand All @@ -18,10 +23,14 @@ def test_check_file_csv():
f.write("a,1\n")
f.write("b,2\n")
f.write("c,3\n")
file = file_from_url(FileTypes.CSV, "https://my.url", set())
file = files_module.file_from_url("https://my.url", set(), epath.Path())
pd.testing.assert_frame_equal(
file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
)
# Fails with unknown encoding_format:


@mock.patch.object(files_module, "guess_file_type", return_value="unknown")
def test_check_file_unknown(guess_file_type):
del guess_file_type
with pytest.raises(NotImplementedError):
file_from_url("unknown", "https://my.url", set())
files_module.file_from_url("https://my.url", set(), epath.Path())
7 changes: 6 additions & 1 deletion editor/core/past_projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from core.constants import PAST_PROJECTS_PATH
from core.query_params import set_project
from core.state import CurrentProject
from core.state import FileObject
from core.state import get_cached_user
from core.state import Metadata

Expand All @@ -23,13 +24,17 @@ def _pickle_file(path: epath.Path) -> epath.Path:


def save_current_project():
metadata = st.session_state[Metadata]
metadata: Metadata = st.session_state[Metadata]
project = st.session_state.get(CurrentProject)
if not project:
project = CurrentProject.create_new()
st.session_state[CurrentProject] = project
project.path.mkdir(parents=True, exist_ok=True)
set_project(project)
# FileObjects should have a folder.
for resource in metadata.distribution:
if isinstance(resource, FileObject):
resource.folder = project.path
try:
pickled = pickle.dumps(metadata)
_pickle_file(project.path).write_bytes(pickled)
Expand Down
13 changes: 13 additions & 0 deletions editor/core/path.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from etils import epath
import streamlit as st

from core.state import CurrentProject


def get_resource_path(content_url: str) -> epath.Path:
    """Gets the path on disk of the resource with `content_url`.

    The parent directory is created if needed so callers can write to the
    returned path directly.
    """
    project: CurrentProject = st.session_state[CurrentProject]
    path = project.path / content_url
    # `exist_ok=True` already tolerates an existing directory; a prior
    # `exists()` check is redundant and race-prone (TOCTOU).
    path.parent.mkdir(parents=True, exist_ok=True)
    return path
1 change: 1 addition & 0 deletions editor/core/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ class FileObject:
sha256: str | None = None
df: pd.DataFrame | None = None
rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
folder: epath.PathLike | None = None


@dataclasses.dataclass
Expand Down
2 changes: 1 addition & 1 deletion editor/cypress/e2e/renameDistribution.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ describe('Renaming of FileObjects/FileSets/RecordSets/Fields.', () => {
getBody().contains('Record Sets').click()
})
cy.contains('genders').click()
cy.contains('Edit fields details').click()
cy.contains('Edit fields details').click({force: true})
cy.contains('the-new-name')
})
})
4 changes: 2 additions & 2 deletions editor/cypress/e2e/uploadCsv.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ describe('Editor loads a local CSV as a resource', () => {
cy.contains('base.csv_record_set (2 fields)').click()
// We also see the fields with the proper types.
cy.get('[data-testid="stDataFrameResizable"]').contains("column1")
cy.get('[data-testid="stDataFrameResizable"]').contains("https://schema.org/Text")
cy.get('[data-testid="stDataFrameResizable"]').contains("Text")
cy.get('[data-testid="stDataFrameResizable"]').contains("column2")
cy.get('[data-testid="stDataFrameResizable"]').contains("https://schema.org/Integer")
cy.get('[data-testid="stDataFrameResizable"]').contains("Integer")

// I can edit the details of the fields.
cy.contains('Generating the dataset...').should('not.exist')
Expand Down
9 changes: 8 additions & 1 deletion editor/events/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import streamlit as st

from core.files import FILE_OBJECT
from core.files import name_to_code
from core.path import get_resource_path
from core.state import FileObject
from core.state import FileSet
from core.state import Metadata
Expand Down Expand Up @@ -37,7 +39,7 @@ def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
elif event == ResourceEvent.DESCRIPTION:
resource.description = value
elif event == ResourceEvent.ENCODING_FORMAT:
resource.encoding_format = value
resource.encoding_format = name_to_code(value)
elif event == ResourceEvent.INCLUDES:
resource.includes = value
elif event == ResourceEvent.SHA256:
Expand All @@ -47,6 +49,11 @@ def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
elif event == ResourceEvent.CONTENT_SIZE:
resource.content_size = value
elif event == ResourceEvent.CONTENT_URL:
if resource.content_url and value:
old_path = get_resource_path(resource.content_url)
new_path = get_resource_path(value)
if old_path.exists() and not new_path.exists():
old_path.rename(new_path)
resource.content_url = value
elif event == ResourceEvent.TYPE:
metadata: Metadata = st.session_state[Metadata]
Expand Down
1 change: 1 addition & 0 deletions editor/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ mlcroissant
numpy
pandas
pytest
python-magic
rdflib
requests
streamlit
Expand Down
Loading
Loading