add data

Signed-off-by: Zhiyuan Chen <[email protected]>
DLS5-Omics · Sep 6, 2024 · 8d455a6 · 8d455a6
1 parent e508b28
commit 8d455a6
Show file tree

Hide file tree

Showing 15 changed files with 579 additions and 5 deletions.
diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml
@@ -18,14 +18,16 @@ jobs:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
     steps:
       - uses: actions/checkout@v3
+        with:
+          submodules: true
       - uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           cache: "pip"
       - name: Install dependencies for testing
-        run: pip install pytest pytest-cov torch torchvision
+        run: pip install pytest pytest-cov
       - name: Install module
-        run: pip install -e .
+        run: pip install -r requirements.txt && pip install -e .
       - name: pytest
         run: pytest --cov=materialx --cov-report=xml --cov-report=html .
       - name: Upload coverage report for documentation
@@ -83,11 +85,11 @@ jobs:
   release:
     if: startsWith(github.event.ref, 'refs/tags/v')
     needs: [lint, test]
+    environment: pypi
     permissions:
       contents: write
       id-token: write
     runs-on: ubuntu-latest
-    environment: pypi
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
@@ -110,6 +112,7 @@ jobs:
   develop:
     if: contains(fromJson('["refs/heads/master", "refs/heads/main"]'), github.ref)
     needs: [lint, test]
+    environment: pypi
     permissions:
       contents: write
     runs-on: ubuntu-latest

diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "data"]
+	path = data
+	url = [email protected]:MultiMolecule/data.git
diff --git a/data b/data
diff --git a/docs/docs/data/dataset.md b/docs/docs/data/dataset.md
@@ -0,0 +1,9 @@
+---
+authors:
+  - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# Dataset
+
+::: multimolecule.data.Dataset
diff --git a/docs/docs/data/index.md b/docs/docs/data/index.md
@@ -0,0 +1,9 @@
+---
+authors:
+  - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# data
+
+--8<-- "multimolecule/data/README.md:8:"
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -9,6 +9,9 @@ repo_url: https://github.com/DLS5-Omics/multimolecule
 
 nav:
   - index.md
+  - data:
+      - data/index.md
+      - Dataset: data/dataset.md
   - module:
       - module/index.md
       - heads: module/heads.md
@@ -182,6 +185,8 @@ plugins:
             - https://docs.python.org/3/objects.inv
             - https://pytorch.org/docs/stable/objects.inv
             - https://huggingface.co/docs/transformers/master/en/objects.inv
+            - https://huggingface.co/docs/datasets/master/en/objects.inv
+            - https://pandas.pydata.org/docs/objects.inv
             - https://danling.org/objects.inv
             - https://chanfig.danling.org/objects.inv
   - section-index

diff --git a/multimolecule/__init__.py b/multimolecule/__init__.py
@@ -14,7 +14,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-from . import models, tokenisers
+from .data import Dataset
 from .models import (
     AutoModelForContactPrediction,
     AutoModelForNucleotidePrediction,
@@ -136,11 +136,11 @@
 __all__ = [
     "modeling_auto",
     "modeling_outputs",
+    "Dataset",
     "PreTrainedConfig",
     "HeadConfig",
     "BaseHeadConfig",
     "MaskedLMHeadConfig",
-    "tokenisers",
     "DnaTokenizer",
     "RnaTokenizer",
     "ProteinTokenizer",

diff --git a/multimolecule/data/README.md b/multimolecule/data/README.md
@@ -0,0 +1,18 @@
+---
+authors:
+  - Zhiyuan Chen
+date: 2024-05-04
+---
+
+# data
+
+`data` provides a collection of utilities for loading and pre-processing data.
+
+While :hugs: [`datasets`](https://huggingface.co/docs/datasets) is a powerful library for managing datasets, it is a general-purpose tool that may not cover all the specific needs of scientific applications.
+
+The `data` package is designed to complement [`datasets`](https://huggingface.co/docs/datasets) by offering additional data processing utilities that are commonly used in scientific tasks.
+
+## Key Features
+
+- Data Pre-Processing: [`Dataset`][multimolecule.Dataset] is a base class that provides a consistent interface for pre-processing data. It includes methods for identifying the data columns, tokenizing sequences, and batching.
+- Data Loading: [`PandasDataset`][multimolecule.PandasDataset] is a subclass of [`Dataset`][multimolecule.Dataset] that loads data in a [`DataFrame`][pandas.DataFrame]-compatible format. This provides a convenient way to work with many common data formats, including CSV, JSON, and Excel files.
diff --git a/multimolecule/data/__init__.py b/multimolecule/data/__init__.py
@@ -0,0 +1,20 @@
+# MultiMolecule
+# Copyright (C) 2024-Present  MultiMolecule
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from .dataset import Dataset
+from .utils import no_collate
+
+__all__ = ["Dataset", "no_collate"]