diff --git a/.coverage b/.coverage
new file mode 100644
index 00000000..c80827bf
Binary files /dev/null and b/.coverage differ
diff --git a/.coverage.mac.lan.53520.XpgnFHRx b/.coverage.mac.lan.53520.XpgnFHRx
new file mode 100644
index 00000000..b0d42a34
Binary files /dev/null and b/.coverage.mac.lan.53520.XpgnFHRx differ
diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 00000000..a7d0a9fb
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,19 @@
+[run]
+source = foundry
+omit =
+    */tests/*
+    */site-packages/*
+    setup.py
+
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    raise NotImplementedError
+    if __name__ == .__main__.:
+    pass
+    raise ImportError
+    except ImportError:
+
+[html]
+directory = coverage_html
\ No newline at end of file
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 10d6a310..3acc0989 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -1,55 +1,57 @@
-name: tests
+name: Tests
 
-on:
-  pull_request:
-    branches:
-      - dev
-      - main
+on: [push, pull_request]
 
 jobs:
-
-  build:
+  test:
     runs-on: ubuntu-latest
-    timeout-minutes: 20
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
-
+        python-version: ['3.9', '3.10', '3.11']
+        install-type: [core, all]
     env:
      CLIENT_ID: ${{ secrets.CLIENT_ID }}
      CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }}
-    name: build
+      CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
    steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v5
-      with:
-        python-version: ${{ matrix.python-version }}
-        cache : 'pip'
-
-    - name: Globus auth
-      run: 'echo "$GLOBUS_CONFIG" > ~/.globus-native-apps.cfg'
-      shell: bash
-      env:
-        GLOBUS_CONFIG: "${{ secrets.GLOBUS_CONFIG }}"
-
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install -r requirements.txt
-        pip install -r test-requirements.txt
-
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are any-flake8 comments
-        flake8 foundry
-
-    - name: Test with pytest
-      run: |
-        pytest -s -v tests/ --cov=./foundry --cov-report=xml
-    - name: Upload coverage to Codecov
-      run: |
-        curl -Os https://uploader.codecov.io/v0.1.0_4653/linux/codecov
-
-        chmod +x codecov
-        ./codecov -t ${{ secrets.CODECOV_TOKEN }}
+      - uses: actions/checkout@v2
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          # Install base package
+          if [ "${{ matrix.install-type }}" = "core" ]; then
+            pip install -e .
+          else
+            pip install -e .[all]
+          fi
+          # Install test dependencies
+          pip install -r requirements-dev.txt
+          pip install -r test-requirements.txt
+          # Install optional dependencies for testing
+          pip install rdkit pandas numpy h5py
+
+      - name: Run tests with coverage
+        run: |
+          if [ "${{ matrix.install-type }}" = "core" ]; then
+            # Skip tests that require optional dependencies
+            pytest --cov=foundry --cov-report=xml --cov-report=term-missing \
+              --ignore=tests/test_molecular.py \
+              --ignore=tests/test_ml_frameworks.py \
+              --ignore=tests/test_loaders_specialized.py
+          else
+            pytest --cov=foundry --cov-report=xml --cov-report=term-missing
+          fi
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          file: ./coverage.xml
+          fail_ci_if_error: false
+          verbose: true
diff --git a/README.md b/README.md
index bc72d143..52125cb8 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,3 @@
-
 
 
 
@@ -9,6 +8,7 @@
 [![Tests](https://github.com/MLMI2-CSSI/foundry/actions/workflows/python-publish.yml/badge.svg)](https://github.com/MLMI2-CSSI/foundry/actions/workflows/python-publish.yml)
 [![NSF-1931306](https://img.shields.io/badge/NSF-1931306-blue)](https://www.nsf.gov/awardsearch/showAward?AWD_ID=1931306&HistoricalAwards=false)
 [](https://ai-materials-and-chemistry.gitbook.io/foundry/)
+[![Coverage](https://codecov.io/gh/MLMI2-CSSI/foundry/branch/main/graph/badge.svg)](https://codecov.io/gh/MLMI2-CSSI/foundry)
 
 Foundry-ML simplifies the discovery and usage of ML-ready datasets in materials science and chemistry, providing a simple API to access even complex datasets.
 
@@ -30,42 +30,61 @@ DLHub documentation for model publication and running information can be found [
 Install Foundry-ML via command line with: `pip install foundry_ml`
 
-You can use the following code to import and instantiate Foundry-ML, then load a dataset.
+You can use the following code to import and instantiate Foundry-ML, then search for datasets:
 
 ```python
 from foundry import Foundry
+
 f = Foundry(index="mdf")
+results_df = f.search(query="materials science", limit=10)
+print(results_df.head())
+```
+
+Below is an example of publishing your own dataset with Foundry:
 
-f = f.load("10.18126/e73h-3w6n", globus=True)
+```python
+# Let's assume you have a local folder of data you'd like to publish
+from foundry.foundry_dataset import FoundryDataset
+
+dataset = FoundryDataset(dataset_name="MyNewDataset")
+dataset.add_data("/path/to/local_data_folder")  # Make sure to have the correct structure
+
+# Then publish the dataset
+res = f.publish_dataset(dataset, update=False, test=False)
+print("Dataset submitted with response:", res)
 ```
 
-*NOTE*: If you run locally and don't want to install the [Globus Connect Personal endpoint](https://www.globus.org/globus-connect-personal), just set the `globus=False`.
+If you run locally and don't want to install the [Globus Connect Personal endpoint](https://www.globus.org/globus-connect-personal), just set `globus=False` when loading datasets.
 
-If running this code in a notebook, a table of metadata for the dataset will appear:
-
-metadata
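+
+Loading a dataset found via search works through the `load` API; a minimal sketch based on the example in the previous version of this README (available splits such as `train` vary by dataset):
+
+```python
+f = f.load("10.18126/e73h-3w6n", globus=True)
+res = f.load_data()
+imgs = res['train']['input']['imgs']
+```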
 
+# How to Contribute
 
-We can use the data with `f.load_data()` and specifying splits such as `train` for different segments of the dataset, then use matplotlib to visualize it.
+We welcome contributions from the community to enhance Foundry-ML. Whether you want to fix a bug, propose a new feature, or improve documentation, follow the steps below to get started:
 
-```python
-res = f.load_data()
+1. Fork the repository:
+   - Click the "Fork" button at the top of this repository's page.
 
-imgs = res['train']['input']['imgs']
-desc = res['train']['input']['metadata']
-coords = res['train']['target']['coords']
+2. Clone your fork locally:
+   - git clone https://github.com/your-username/foundry.git
 
-n_images = 3
-offset = 150
-key_list = list(res['train']['input']['imgs'].keys())[0+offset:n_images+offset]
+3. Create a new branch for your feature or fix:
+   - git checkout -b feature/my-new-feature
 
-fig, axs = plt.subplots(1, n_images, figsize=(20,20))
-for i in range(n_images):
-    axs[i].imshow(imgs[key_list[i]])
-    axs[i].scatter(coords[key_list[i]][:,0], coords[key_list[i]][:,1], s = 20, c = 'r', alpha=0.5)
-```
-Screen Shot 2022-10-20 at 2 22 43 PM
+4. Install dependencies and set up a virtual environment if needed:
+   - pip install -r requirements.txt
+
+5. Make your changes and write tests:
+   - For code-related changes, add or update tests under tests/ to ensure ongoing stability.
+
+6. Run tests to confirm everything works:
+   - pytest
+
+7. Commit your changes and push the branch to GitHub:
+   - git push origin feature/my-new-feature
 
-[See full examples](./examples)
+8. Create a Pull Request:
+   - On GitHub, open a Pull Request from your branch to the main branch of MLMI2-CSSI/foundry.
+
+Our team will review your submission and provide feedback. Thank you for helping us grow Foundry-ML!
 
 # How to Cite
 If you find Foundry-ML useful, please cite the following [paper](https://doi.org/10.21105/joss.05467)
@@ -108,3 +127,45 @@ https://www.dlhub.org
 ## The Materials Data Facility
 This work was performed under financial assistance award 70NANB14H012 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the [Center for Hierarchical Material Design (CHiMaD)](http://chimad.northwestern.edu). This work was performed under the following financial assistance award 70NANB19H005 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the Center for Hierarchical Materials Design (CHiMaD). This work was also supported by the National Science Foundation as part of the [Midwest Big Data Hub](http://midwestbigdatahub.org) under NSF Award Number: 1636950 "BD Spokes: SPOKE: MIDWEST: Collaborative: Integrative Materials Design (IMaD): Leverage, Innovate, and Disseminate".
 https://www.materialsdatafacility.org
+
+## Installation
+
+Basic installation:
+```bash
+pip install foundry_ml
+```
+
+With optional features:
+```bash
+# For molecular data support
+pip install foundry_ml[molecular]
+
+# For PyTorch integration
+pip install foundry_ml[torch]
+
+# For TensorFlow integration
+pip install foundry_ml[tensorflow]
+
+# Install all optional dependencies
+pip install foundry_ml[all]
+```
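+
+If you use an optional feature without the matching extra installed, Foundry-ML raises an `ImportError` that names the extra to install. A small sketch with the `optional_import` helper added in this change (the `torch` check is purely illustrative):
+
+```python
+from foundry.utils import optional_import
+
+torch = optional_import('torch')  # returns None when torch is not installed
+if torch is None:
+    print("Install PyTorch support with: pip install foundry_ml[torch]")
+```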
+
+## Development Installation
+
+For development, you can install all dependencies including testing tools:
+
+```bash
+# Clone the repository
+git clone https://github.com/MLMI2-CSSI/foundry.git
+cd foundry
+
+# Install in development mode with all extras
+pip install -e .[all]
+
+# Install development dependencies
+pip install -r requirements-dev.txt
+
+# Run tests
+pytest  # Run all tests
+pytest --ignore=tests/test_molecular.py  # Skip tests requiring optional dependencies
+```
diff --git a/data/https_test/test_data.json b/data/https_test/test_data.json
new file mode 100644
index 00000000..1009fa5f
--- /dev/null
+++ b/data/https_test/test_data.json
@@ -0,0 +1 @@
2806696,\"D\":0.0077623626},{\"A\":0.0780094832,\"B\":0.9021410901,\"C\":0.6492799769,\"D\":0.6116689937},{\"A\":0.3045182842,\"B\":0.8557673608,\"C\":0.1466120067,\"D\":0.5995064907},{\"A\":0.6430901418,\"B\":0.2708746313,\"C\":0.1527527593,\"D\":0.3692156059},{\"A\":0.5106498724,\"B\":0.0371026187,\"C\":0.1790488679,\"D\":0.6432881157},{\"A\":0.0016580644,\"B\":0.6681410215,\"C\":0.4205885498,\"D\":0.8184233958},{\"A\":0.0964162811,\"B\":0.3384898573,\"C\":0.8300644236,\"D\":0.4340685704},{\"A\":0.8950115298,\"B\":0.9260489775,\"C\":0.0742238435,\"D\":0.2144982415},{\"A\":0.8115382564,\"B\":0.1321103032,\"C\":0.0943974514,\"D\":0.0431889336},{\"A\":0.7288958605,\"B\":0.9392151718,\"C\":0.7556298045,\"D\":0.1455506361},{\"A\":0.1091433253,\"B\":0.1931513453,\"C\":0.6288272134,\"D\":0.8938803543},{\"A\":0.5666071526,\"B\":0.6866406665,\"C\":0.3236813586,\"D\":0.905914349},{\"A\":0.0742907865,\"B\":0.5449429136,\"C\":0.6803403721,\"D\":0.2261924153},{\"A\":0.9670714577,\"B\":0.3705962079,\"C\":0.6221720406,\"D\":0.9565497286},{\"A\":0.8795892767,\"B\":0.503880071,\"C\":0.6675467877,\"D\":0.8789986354},{\"A\":0.2085745935,\"B\":0.2422616603,\"C\":0.6609245125,\"D\":0.5574753552},{\"A\":0.3901471816,\"B\":0.4636722689,\"C\":0.6276594441,\"D\":0.9157642311},{\"A\":0.4989495397,\"B\":0.8789409087,\"C\":0.8639646997,\"D\":0.4271356786},{\"A\":0.6699409813,\"B\":0.1002664854,\"C\":0.2746497369,\"D\":0.9139472198},{\"A\":0.9931311364,\"B\":0.8188230319,\"C\":0.0256276884,\"D\":0.7103069326},{\"A\":0.651548713,\"B\":0.564519124,\"C\":0.2685295664,\"D\":0.5504980797},{\"A\":0.6783159928,\"B\":0.8582392923,\"C\":0.0106426152,\"D\":0.343881835},{\"A\":0.9125146347,\"B\":0.8098421837,\"C\":0.1570679367,\"D\":0.7426108926},{\"A\":0.5605342219,\"B\":0.5531496696,\"C\":0.9762101759,\"D\":0.9150743664},{\"A\":0.4589324575,\"B\":0.3824806997,\"C\":0.4509337436,\"D\":0.0977040707},{\"A\":0.6860159363,\"B\":0.3468538712,\"C\":0.4075233003,\"D\":0.8304722601},{\"A\":0.2042029477,\"B\":0.8559035537,\"C\":0.1758115017,\"D\":0.2806214757},{\"A\":0.3396679638,\"B\":0.108741023,\"C\":0.0471056095,\"D\":0.549568292},{\"A\":0.3966874957,\"B\":0.5983624282,\"C\":0.7548218378,\"D\":0.6888001455},{\"A\":0.2496667117,\"B\":0.9411866804,\"C\":0.5447466736,\"D\":0.6027201657},{\"A\":0.6803464783,\"B\":0.6756867895,\"C\":0.1776041426,\"D\":0.8260296751},{\"A\":0.5324327065,\"B\":0.8164624535,\"C\":0.0345776987,\"D\":0.6946683903},{\"A\":0.3278234112,\"B\":0.7767899593,\"C\":0.4236985558,\"D\":0.7855641902},{\"A\":0.1871835033,\"B\":0.7185915704,\"C\":0.8892584241,\"D\":0.9041425981},{\"A\":0.0252279047,\"B\":0.1327784456,\"C\":0.4012167679,\"D\":0.6827824908},{\"A\":0.3242069607,\"B\":0.5819640332,\"C\":0.8668542152,\"D\":0.1434934146},{\"A\":0.8318364286,\"B\":0.0538128288,\"C\":0.465820118,\"D\":0.920627827},{\"A\":0.4278724113,\"B\":0.1968834312,\"C\":0.0518638606,\"D\":0.6344022855},{\"A\":0.8190435048,\"B\":0.8062720847,\"C\":0.2882240224,\"D\":0.865150816},{\"A\":0.2061553601,\"B\":0.2758599846,\"C\":0.5853995293,\"D\":0.6143356356},{\"A\":0.3124749213,\"B\":0.8534254958,\"C\":0.2368244264,\"D\":0.0347652254},{\"A\":0.2462624136,\"B\":0.7669510032,\"C\":0.4371029225,\"D\":0.8295093479},{\"A\":0.4809361558,\"B\":0.794014646,\"C\":0.7514307895,\"D\":0.5129345564},{\"A\":0.3361090064,\"B\":0.0592829927,\"C\":0.2856413958,\"D\":0.4455347327},{\"A\":0.650641595,\"B\":0.0793956841,\"C\":0.928293866,\"D\":0.4721011054},{\"A\":0.2117624986,\"B\":0.0142221126,\"C\":0.2005417756,\"D\":0.8059841184},{\"A\":0.4660735224,\"B\":0.52
15475799,\"C\":0.7693073705,\"D\":0.8356910391},{\"A\":0.0885559958,\"B\":0.0119811352,\"C\":0.3977151269,\"D\":0.1626498842},{\"A\":0.4535536676,\"B\":0.2507941702,\"C\":0.8279387846,\"D\":0.1019767725},{\"A\":0.503070451,\"B\":0.6063300971,\"C\":0.4454432603,\"D\":0.0517681478},{\"A\":0.6022218039,\"B\":0.7411709954,\"C\":0.9960575021,\"D\":0.9662997087},{\"A\":0.3314992037,\"B\":0.9145246624,\"C\":0.3891086247,\"D\":0.8709633507},{\"A\":0.65170299,\"B\":0.220147899,\"C\":0.4263250241,\"D\":0.8495339182},{\"A\":0.0890642543,\"B\":0.697432947,\"C\":0.59118971,\"D\":0.932403869},{\"A\":0.8927340507,\"B\":0.9653811365,\"C\":0.0352710207,\"D\":0.8088212468},{\"A\":0.44895611,\"B\":0.5863929011,\"C\":0.4340695146,\"D\":0.5461338805},{\"A\":0.8779709692,\"B\":0.7346527936,\"C\":0.5166408555,\"D\":0.7090164912},{\"A\":0.1301537384,\"B\":0.038524793,\"C\":0.1086061957,\"D\":0.5054981954},{\"A\":0.4676610596,\"B\":0.0464128774,\"C\":0.0701372197,\"D\":0.804096783},{\"A\":0.2637587773,\"B\":0.570283762,\"C\":0.8375562828,\"D\":0.6459677418},{\"A\":0.7537321074,\"B\":0.5692118474,\"C\":0.3920713273,\"D\":0.6739361886},{\"A\":0.9293691973,\"B\":0.0942572446,\"C\":0.5260171998,\"D\":0.9192443409},{\"A\":0.0272109634,\"B\":0.905314036,\"C\":0.8146917598,\"D\":0.612996192},{\"A\":0.8257428339,\"B\":0.3017979784,\"C\":0.8840314119,\"D\":0.389652628},{\"A\":0.3656587178,\"B\":0.0713495029,\"C\":0.0356915025,\"D\":0.2670063823},{\"A\":0.2022555318,\"B\":0.0913283538,\"C\":0.8383074628,\"D\":0.3797109131},{\"A\":0.1898597875,\"B\":0.3885636091,\"C\":0.0757421967,\"D\":0.3362250295},{\"A\":0.5289049892,\"B\":0.5188115576,\"C\":0.1688304016,\"D\":0.845068647},{\"A\":0.4126513276,\"B\":0.2554108881,\"C\":0.015179739,\"D\":0.5686325064}]" \ No newline at end of file diff --git a/data/tmp_data.json b/data/tmp_data.json new file mode 100644 index 00000000..1009fa5f --- /dev/null +++ b/data/tmp_data.json @@ -0,0 +1 @@ 
+"[{\"A\":0.643870455,\"B\":0.6091335397,\"C\":0.7822690824,\"D\":0.8268479159},{\"A\":0.9456540646,\"B\":0.3326369296,\"C\":0.8841205492,\"D\":0.2556220774},{\"A\":0.6291401542,\"B\":0.2247617025,\"C\":0.8462314772,\"D\":0.5304308964},{\"A\":0.975547697,\"B\":0.5416567183,\"C\":0.9508777456,\"D\":0.2569899117},{\"A\":0.4010386755,\"B\":0.4127109481,\"C\":0.017865727,\"D\":0.421698277},{\"A\":0.7083439555,\"B\":0.0440585057,\"C\":0.8736216744,\"D\":0.5628843021},{\"A\":0.3236709921,\"B\":0.074779633,\"C\":0.5776595747,\"D\":0.2022646601},{\"A\":0.4839563307,\"B\":0.425772453,\"C\":0.8387236919,\"D\":0.4334753751},{\"A\":0.5215194358,\"B\":0.2194216689,\"C\":0.6183393944,\"D\":0.797299772},{\"A\":0.0713460804,\"B\":0.3466899498,\"C\":0.6522131807,\"D\":0.1778979284},{\"A\":0.2707517899,\"B\":0.095771952,\"C\":0.1290605884,\"D\":0.9248904319},{\"A\":0.8716852911,\"B\":0.9444690268,\"C\":0.1294852031,\"D\":0.5068766826},{\"A\":0.5843460535,\"B\":0.9755514645,\"C\":0.9818506546,\"D\":0.3279229361},{\"A\":0.3007062433,\"B\":0.6231023141,\"C\":0.7639033834,\"D\":0.8988798453},{\"A\":0.7695470502,\"B\":0.0501137742,\"C\":0.6176087152,\"D\":0.1445764886},{\"A\":0.2675470175,\"B\":0.4640297176,\"C\":0.8329810114,\"D\":0.6570328862},{\"A\":0.7674771752,\"B\":0.1118872796,\"C\":0.7793321285,\"D\":0.1543847316},{\"A\":0.6721579139,\"B\":0.6553073897,\"C\":0.2456219058,\"D\":0.4863060138},{\"A\":0.8931107755,\"B\":0.4741801354,\"C\":0.7188146988,\"D\":0.3763431712},{\"A\":0.5469096954,\"B\":0.8118171604,\"C\":0.4255865379,\"D\":0.2218850497},{\"A\":0.9271195751,\"B\":0.8887750214,\"C\":0.7481395646,\"D\":0.8030491361},{\"A\":0.2463806012,\"B\":0.7847482354,\"C\":0.8679944882,\"D\":0.9079702233},{\"A\":0.6210355879,\"B\":0.9386518629,\"C\":0.701089623,\"D\":0.2504838653},{\"A\":0.8492633109,\"B\":0.3586585934,\"C\":0.0629805626,\"D\":0.9214151715},{\"A\":0.5424288564,\"B\":0.4999920911,\"C\":0.3261323566,\"D\":0.1618166139},{\"A\":0.2888237847,\"B\":0.9141131506,\"C\":0.0343475971,\"D\":0.8115089618},{\"A\":0.2522191092,\"B\":0.345685169,\"C\":0.7884565197,\"D\":0.0355617466},{\"A\":0.4457942938,\"B\":0.6971631221,\"C\":0.5781304242,\"D\":0.1200273581},{\"A\":0.6002382923,\"B\":0.4484248841,\"C\":0.7621932221,\"D\":0.7981014515},{\"A\":0.3691042403,\"B\":0.5542810312,\"C\":0.1286744678,\"D\":0.0136204246},{\"A\":0.4810123215,\"B\":0.6406599704,\"C\":0.6024308844,\"D\":0.1024778434},{\"A\":0.8038373911,\"B\":0.9133361012,\"C\":0.3962806696,\"D\":0.0077623626},{\"A\":0.0780094832,\"B\":0.9021410901,\"C\":0.6492799769,\"D\":0.6116689937},{\"A\":0.3045182842,\"B\":0.8557673608,\"C\":0.1466120067,\"D\":0.5995064907},{\"A\":0.6430901418,\"B\":0.2708746313,\"C\":0.1527527593,\"D\":0.3692156059},{\"A\":0.5106498724,\"B\":0.0371026187,\"C\":0.1790488679,\"D\":0.6432881157},{\"A\":0.0016580644,\"B\":0.6681410215,\"C\":0.4205885498,\"D\":0.8184233958},{\"A\":0.0964162811,\"B\":0.3384898573,\"C\":0.8300644236,\"D\":0.4340685704},{\"A\":0.8950115298,\"B\":0.9260489775,\"C\":0.0742238435,\"D\":0.2144982415},{\"A\":0.8115382564,\"B\":0.1321103032,\"C\":0.0943974514,\"D\":0.0431889336},{\"A\":0.7288958605,\"B\":0.9392151718,\"C\":0.7556298045,\"D\":0.1455506361},{\"A\":0.1091433253,\"B\":0.1931513453,\"C\":0.6288272134,\"D\":0.8938803543},{\"A\":0.5666071526,\"B\":0.6866406665,\"C\":0.3236813586,\"D\":0.905914349},{\"A\":0.0742907865,\"B\":0.5449429136,\"C\":0.6803403721,\"D\":0.2261924153},{\"A\":0.9670714577,\"B\":0.3705962079,\"C\":0.6221720406,\"D\":0.9565497286},{\"A\":0.8795892767,\"B\":0.503880071,\"C\":0.6675467
877,\"D\":0.8789986354},{\"A\":0.2085745935,\"B\":0.2422616603,\"C\":0.6609245125,\"D\":0.5574753552},{\"A\":0.3901471816,\"B\":0.4636722689,\"C\":0.6276594441,\"D\":0.9157642311},{\"A\":0.4989495397,\"B\":0.8789409087,\"C\":0.8639646997,\"D\":0.4271356786},{\"A\":0.6699409813,\"B\":0.1002664854,\"C\":0.2746497369,\"D\":0.9139472198},{\"A\":0.9931311364,\"B\":0.8188230319,\"C\":0.0256276884,\"D\":0.7103069326},{\"A\":0.651548713,\"B\":0.564519124,\"C\":0.2685295664,\"D\":0.5504980797},{\"A\":0.6783159928,\"B\":0.8582392923,\"C\":0.0106426152,\"D\":0.343881835},{\"A\":0.9125146347,\"B\":0.8098421837,\"C\":0.1570679367,\"D\":0.7426108926},{\"A\":0.5605342219,\"B\":0.5531496696,\"C\":0.9762101759,\"D\":0.9150743664},{\"A\":0.4589324575,\"B\":0.3824806997,\"C\":0.4509337436,\"D\":0.0977040707},{\"A\":0.6860159363,\"B\":0.3468538712,\"C\":0.4075233003,\"D\":0.8304722601},{\"A\":0.2042029477,\"B\":0.8559035537,\"C\":0.1758115017,\"D\":0.2806214757},{\"A\":0.3396679638,\"B\":0.108741023,\"C\":0.0471056095,\"D\":0.549568292},{\"A\":0.3966874957,\"B\":0.5983624282,\"C\":0.7548218378,\"D\":0.6888001455},{\"A\":0.2496667117,\"B\":0.9411866804,\"C\":0.5447466736,\"D\":0.6027201657},{\"A\":0.6803464783,\"B\":0.6756867895,\"C\":0.1776041426,\"D\":0.8260296751},{\"A\":0.5324327065,\"B\":0.8164624535,\"C\":0.0345776987,\"D\":0.6946683903},{\"A\":0.3278234112,\"B\":0.7767899593,\"C\":0.4236985558,\"D\":0.7855641902},{\"A\":0.1871835033,\"B\":0.7185915704,\"C\":0.8892584241,\"D\":0.9041425981},{\"A\":0.0252279047,\"B\":0.1327784456,\"C\":0.4012167679,\"D\":0.6827824908},{\"A\":0.3242069607,\"B\":0.5819640332,\"C\":0.8668542152,\"D\":0.1434934146},{\"A\":0.8318364286,\"B\":0.0538128288,\"C\":0.465820118,\"D\":0.920627827},{\"A\":0.4278724113,\"B\":0.1968834312,\"C\":0.0518638606,\"D\":0.6344022855},{\"A\":0.8190435048,\"B\":0.8062720847,\"C\":0.2882240224,\"D\":0.865150816},{\"A\":0.2061553601,\"B\":0.2758599846,\"C\":0.5853995293,\"D\":0.6143356356},{\"A\":0.3124749213,\"B\":0.8534254958,\"C\":0.2368244264,\"D\":0.0347652254},{\"A\":0.2462624136,\"B\":0.7669510032,\"C\":0.4371029225,\"D\":0.8295093479},{\"A\":0.4809361558,\"B\":0.794014646,\"C\":0.7514307895,\"D\":0.5129345564},{\"A\":0.3361090064,\"B\":0.0592829927,\"C\":0.2856413958,\"D\":0.4455347327},{\"A\":0.650641595,\"B\":0.0793956841,\"C\":0.928293866,\"D\":0.4721011054},{\"A\":0.2117624986,\"B\":0.0142221126,\"C\":0.2005417756,\"D\":0.8059841184},{\"A\":0.4660735224,\"B\":0.5215475799,\"C\":0.7693073705,\"D\":0.8356910391},{\"A\":0.0885559958,\"B\":0.0119811352,\"C\":0.3977151269,\"D\":0.1626498842},{\"A\":0.4535536676,\"B\":0.2507941702,\"C\":0.8279387846,\"D\":0.1019767725},{\"A\":0.503070451,\"B\":0.6063300971,\"C\":0.4454432603,\"D\":0.0517681478},{\"A\":0.6022218039,\"B\":0.7411709954,\"C\":0.9960575021,\"D\":0.9662997087},{\"A\":0.3314992037,\"B\":0.9145246624,\"C\":0.3891086247,\"D\":0.8709633507},{\"A\":0.65170299,\"B\":0.220147899,\"C\":0.4263250241,\"D\":0.8495339182},{\"A\":0.0890642543,\"B\":0.697432947,\"C\":0.59118971,\"D\":0.932403869},{\"A\":0.8927340507,\"B\":0.9653811365,\"C\":0.0352710207,\"D\":0.8088212468},{\"A\":0.44895611,\"B\":0.5863929011,\"C\":0.4340695146,\"D\":0.5461338805},{\"A\":0.8779709692,\"B\":0.7346527936,\"C\":0.5166408555,\"D\":0.7090164912},{\"A\":0.1301537384,\"B\":0.038524793,\"C\":0.1086061957,\"D\":0.5054981954},{\"A\":0.4676610596,\"B\":0.0464128774,\"C\":0.0701372197,\"D\":0.804096783},{\"A\":0.2637587773,\"B\":0.570283762,\"C\":0.8375562828,\"D\":0.6459677418},{\"A\":0.7537321074,\"B\":0.5692118474,\"C\":
0.3920713273,\"D\":0.6739361886},{\"A\":0.9293691973,\"B\":0.0942572446,\"C\":0.5260171998,\"D\":0.9192443409},{\"A\":0.0272109634,\"B\":0.905314036,\"C\":0.8146917598,\"D\":0.612996192},{\"A\":0.8257428339,\"B\":0.3017979784,\"C\":0.8840314119,\"D\":0.389652628},{\"A\":0.3656587178,\"B\":0.0713495029,\"C\":0.0356915025,\"D\":0.2670063823},{\"A\":0.2022555318,\"B\":0.0913283538,\"C\":0.8383074628,\"D\":0.3797109131},{\"A\":0.1898597875,\"B\":0.3885636091,\"C\":0.0757421967,\"D\":0.3362250295},{\"A\":0.5289049892,\"B\":0.5188115576,\"C\":0.1688304016,\"D\":0.845068647},{\"A\":0.4126513276,\"B\":0.2554108881,\"C\":0.015179739,\"D\":0.5686325064}]"
\ No newline at end of file
diff --git a/foundry/foundry.py b/foundry/foundry.py
index a3d0ca5b..eaa31b74 100644
--- a/foundry/foundry.py
+++ b/foundry/foundry.py
@@ -219,13 +219,18 @@ def search(self, query: str = None, limit: int = None, as_list: bool = False) ->
         >>> print(len(results))
         10
         """
-        if (query is not None) and (is_doi(query)):
-            metadata_list = [self.get_metadata_by_doi(query)]
-        else:
-            metadata_list = self.get_metadata_by_query(query, limit)
-
-        if len(metadata_list) == 0:
-            raise Exception(f"load: No results found for the query '{query}'")
+        try:
+            if (query is not None) and (is_doi(query)):
+                metadata_list = [self.get_metadata_by_doi(query)]
+            else:
+                metadata_list = self.get_metadata_by_query(query, limit)
+
+            if len(metadata_list) == 0:
+                raise NoResultsFoundError(f"No results found for query '{query}'")
+
+        except Exception as e:
+            logger.error(f"Search failed: {str(e)}", exc_info=True)
+            raise
 
         foundry_datasets = []
         for metadata in metadata_list:
diff --git a/foundry/foundry_cache.py b/foundry/foundry_cache.py
index 7fbfab8a..39b800e7 100644
--- a/foundry/foundry_cache.py
+++ b/foundry/foundry_cache.py
@@ -10,11 +10,16 @@ import shutil
 from tqdm.auto import tqdm
 from typing import List, Any, Tuple
+from pathlib import Path
 
 from .https_download import recursive_ls, download_file
 from foundry.jsonschema_models.project_model import Split as FoundrySplit
 from foundry.models import FoundrySchema
-from foundry.utils import _read_csv, _read_json, _read_excel, is_pandas_pytable
+from .utils import (
+    _read_csv, _read_json, _read_excel,
+    is_pandas_pytable
+)
+from .loaders.registry import LoaderRegistry
 
 logger = logging.getLogger(__name__)
@@ -54,6 +59,7 @@ def __init__(self,
         self.interval = interval
         self.parallel_https = parallel_https
         self.verbose = verbose
+        self.loader_registry = LoaderRegistry()
 
     def download_to_cache(self,
                           dataset_name: str,
@@ -342,68 +348,18 @@ def _load_data(self,
         Returns:
             tuple: A tuple containing the input and target data.
         """
-        # Build the path to access the cached data
-        path = os.path.join(self.local_cache_dir, source_id)
-        if path is None:
-            raise ValueError(f"Path to data file is invalid; check that dataset source_id is valid: "
-                             f"{source_id or self.mdf['source_id']}")
-
-        # Check for version folders
-        version_folders = [d for d in os.listdir(path) if re.match(r'\d+\.\d+', d)]
-        if version_folders:
-            # Sort version folders and get the latest one
-            latest_version = sorted(version_folders, key=lambda x: [int(n) for n in x.split('.')], reverse=True)[0]
-            path = os.path.join(path, latest_version)
-            print(f"Loading from version folder: {latest_version}")
-
-        path_to_file = os.path.join(path, file)
-
-        # Check to see whether file exists at path
-        if not os.path.isfile(path_to_file):
-            raise FileNotFoundError(f"No file found at expected path: {path_to_file}")
-
-        # Handle Foundry-defined types.
-        if foundry_schema.data_type == "tabular":
-            # TODO: Add hashes and versioning to metadata and checking to the file
-            read_fns = [(_read_json, {"lines": False, "path_to_file": path_to_file}),
-                        (_read_json, {"lines": True, "path_to_file": path_to_file}),
-                        (_read_csv, {"path_to_file": path_to_file}),
-                        (_read_excel, {"path_to_file": path_to_file})]
-            dataframe = None
-            for fn, params in read_fns:
-                try:
-                    dataframe = fn(**params)
-                except Exception as e:
-                    logger.info(f"Unable to read file with {fn.__name__} with params {params}: {e}")
-                if dataframe is not None:
-                    logger.info(f"Succeeded with {fn.__name__} with params {params}")
-                    break
-            if dataframe is None:
-                logger.fatal(f"Cannot read {path_to_file} as tabular data, failed to load")
-                raise ValueError(f"Cannot read tabular data from {path_to_file}")
-            return (
-                dataframe[self.get_keys(foundry_schema, "input")],
-                dataframe[self.get_keys(foundry_schema, "target")],
-            )
-        elif foundry_schema.data_type == "hdf5":
-            f = h5py.File(path_to_file, "r")
-            special_types = ["input", "target"]
-            tmp_data = {s: {} for s in special_types}
-            for s in special_types:
-                for key in self.get_keys(foundry_schema, s):
-                    if as_hdf5:
-                        tmp_data[s][key] = f[key]
-                    elif isinstance(f[key], h5py.Group):
-                        if is_pandas_pytable(f[key]):
-                            df = pd.read_hdf(path_to_file, key)
-                            tmp_data[s][key] = df
-                        else:
-                            tmp_data[s][key] = f[key]
-                    elif isinstance(f[key], h5py.Dataset):
-                        tmp_data[s][key] = f[key][0:]
-            return tmp_data
-        else:
-            raise NotImplementedError
+        path = Path(self.local_cache_dir) / source_id / file
+
+        if not path.exists():
+            raise FileNotFoundError(f"No file found at: {path}")
+
+        loader = self.loader_registry.get_loader(
+            path,
+            foundry_schema.data_type,
+            self.local_cache_dir
+        )
+
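+        # The registry resolves a loader by the schema's data_type first and
+        # falls back to matching on file extension (see LoaderRegistry).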
+        return loader.load(path, foundry_schema, as_hdf5=as_hdf5)
 
     def get_keys(self,
                  foundry_schema: FoundrySchema,
diff --git a/foundry/jsonschema_models/config.py b/foundry/jsonschema_models/config.py
new file mode 100644
index 00000000..35edff96
--- /dev/null
+++ b/foundry/jsonschema_models/config.py
@@ -0,0 +1,18 @@
+from typing import Any, Dict
+from pydantic import ConfigDict
+
+def get_model_config(extra: str = 'forbid') -> Dict[str, Any]:
+    """
+    Get standardized model configuration for Pydantic v2
+
+    Args:
+        extra: How to handle extra attributes ('allow', 'forbid', or 'ignore')
+
+    Returns:
+        ConfigDict with standardized settings
+    """
+    return ConfigDict(
+        extra=extra,
+        validate_assignment=True,
+        frozen=False
+    )
\ No newline at end of file
diff --git a/foundry/jsonschema_models/project_model.py b/foundry/jsonschema_models/project_model.py
index 60e5cfec..5dee92c6 100644
--- a/foundry/jsonschema_models/project_model.py
+++ b/foundry/jsonschema_models/project_model.py
@@ -7,6 +7,7 @@
 from typing import Any, Dict, List, Optional, Union
 
 from pydantic import BaseModel, Extra, Field
+from .config import get_model_config
 
 
 class Nanomfg(BaseModel):
@@ -93,8 +94,8 @@ class Config:
 
 
 class Split(BaseModel):
-    class Config:
-        extra = Extra.forbid
+    """A split of data in the dataset"""
+    model_config = get_model_config('forbid')
 
     type: Optional[str] = Field(
         None,
diff --git a/foundry/loaders/__init__.py b/foundry/loaders/__init__.py
index e69de29b..b65f5fe9 100644
--- a/foundry/loaders/__init__.py
+++ b/foundry/loaders/__init__.py
@@ -0,0 +1,11 @@
+from .base import DataLoader
+from .tabular import TabularDataLoader
+from .hdf5 import HDF5DataLoader
+from .registry import LoaderRegistry
+
+__all__ = [
+    'DataLoader',
+    'TabularDataLoader',
+    'HDF5DataLoader',
+    'LoaderRegistry'
+]
diff --git a/foundry/loaders/base.py b/foundry/loaders/base.py
new file mode 100644
index 00000000..5b830879
--- /dev/null
+++ b/foundry/loaders/base.py
@@ -0,0 +1,31 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Tuple, Optional
+from pathlib import Path
+
+from foundry.jsonschema_models.project_model import Split
+from foundry.models import FoundrySchema
+
+class DataLoader(ABC):
+    """Base class for all Foundry data loaders
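+
+    Subclasses implement ``supports_format`` and ``load``; a minimal sketch
+    (mirrors the custom loader registered in tests/test_loaders.py):
+
+        class CustomLoader(DataLoader):
+            def supports_format(self, file_path):
+                return file_path.suffix == '.custom'
+
+            def load(self, file_path, schema, split=None, as_hdf5=False):
+                return None, None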
+    """
+
+    def __init__(self, cache_dir: str):
+        self.cache_dir = Path(cache_dir)
+
+    @abstractmethod
+    def load(self,
+             file_path: Path,
+             schema: FoundrySchema,
+             split: Optional[Split] = None,
+             as_hdf5: bool = False) -> Tuple[Any, Any]:
+        """Load data from file and return input/target tuple"""
+        pass
+
+    @abstractmethod
+    def supports_format(self, file_path: Path) -> bool:
+        """Check if this loader supports the given file format"""
+        pass
+
+    def get_keys(self, schema: FoundrySchema, key_type: str) -> list:
+        """Extract keys of given type from schema"""
+        keys = [key.key for key in schema.keys if key.type == key_type]
+        return [k for sublist in keys for k in sublist]  # Flatten
\ No newline at end of file
diff --git a/foundry/loaders/hdf5.py b/foundry/loaders/hdf5.py
new file mode 100644
index 00000000..29aaa567
--- /dev/null
+++ b/foundry/loaders/hdf5.py
@@ -0,0 +1,40 @@
+import h5py
+from pathlib import Path
+from typing import Tuple, Any
+
+from .base import DataLoader
+from ..utils import is_pandas_pytable
+
+class HDF5DataLoader(DataLoader):
+    """Loader for HDF5 data formats"""
+
+    def supports_format(self, file_path: Path) -> bool:
+        return file_path.suffix.lower() in {'.h5', '.hdf5'}
+
+    def load(self, file_path: Path, schema, split=None, as_hdf5: bool = False) -> Tuple[Any, Any]:
+        """Load HDF5 data
+
+        Args:
+            file_path: Path to HDF5 file
+            schema: Schema describing the data format
+            split: Optional split information
+            as_hdf5: If True, return h5py Dataset objects instead of converting to arrays
+
+        Returns:
+            Tuple of (input_data, target_data)
+        """
+        # Keep the file handle open when returning live h5py objects; a `with`
+        # block would close the file and invalidate the returned datasets
+        f = h5py.File(str(file_path), 'r')
+        input_keys = self.get_keys(schema, "input")
+        target_keys = self.get_keys(schema, "target")
+
+        if as_hdf5:
+            inputs = {k: f[k] for k in input_keys}
+            targets = {k: f[k] for k in target_keys}
+        else:
+            inputs = {k: f[k][:] for k in input_keys}
+            targets = {k: f[k][:] for k in target_keys}
+            f.close()
+
+        return inputs, targets
\ No newline at end of file
diff --git a/foundry/loaders/image.py b/foundry/loaders/image.py
new file mode 100644
index 00000000..c64dc49c
--- /dev/null
+++ b/foundry/loaders/image.py
@@ -0,0 +1,43 @@
+from PIL import Image
+import numpy as np
+from pathlib import Path
+from typing import Tuple, Any, List
+import json
+
+from .base import DataLoader
+
+class ImageDataLoader(DataLoader):
+    """Loader for image datasets with optional annotations"""
+
+    SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.tiff', '.bmp'}
+
+    def supports_format(self, file_path: Path) -> bool:
+        return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS
+
+    def load(self, file_path: Path, schema, split=None) -> Tuple[Any, Any]:
+        """Load image data and associated annotations if they exist"""
+        # Load image
+        img = Image.open(file_path)
+        img_array = np.array(img)
+
+        # Check for annotation file
+        annotation_path = file_path.with_suffix('.json')
+        if annotation_path.exists():
+            with open(annotation_path) as f:
+                annotations = json.load(f)
+        else:
+            annotations = None
+
+        return img_array, annotations
+
+    def load_batch(self, file_paths: List[Path], schema, split=None) -> Tuple[Any, Any]:
+        """Load multiple images efficiently"""
+        images = []
+        annotations = []
+
+        for path in file_paths:
+            img, annot = self.load(path, schema, split)
+            images.append(img)
+            annotations.append(annot)
+
+        return np.stack(images), annotations  # np.stack assumes a uniform image shape
\ No newline at end of file
diff --git a/foundry/loaders/molecular.py b/foundry/loaders/molecular.py
new file mode 100644
index 00000000..8532a7f8
--- /dev/null
+++ b/foundry/loaders/molecular.py
@@ -0,0 +1,66 @@
+from pathlib import Path
+from typing import Tuple, Any, Dict
+import pandas as pd
+import numpy as np
+
+from .base import DataLoader
+from ..utils import optional_import, require_package
+
+# Lazy import of rdkit
+rdkit = optional_import('rdkit')
+if rdkit:
+    from rdkit import Chem
+    from rdkit.Chem import AllChem, Descriptors
+
+class MolecularDataLoader(DataLoader):
+    """Loader for molecular data formats (.sdf, .mol2, .pdb)"""
+
+    SUPPORTED_EXTENSIONS = {'.sdf', '.mol2', '.pdb'}  # .xyz removed: load() has no handler for it
+
+    def supports_format(self, file_path: Path) -> bool:
+        return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS
+
+    def load(self, file_path: Path, schema, split=None, as_hdf5: bool = False) -> Tuple[Any, Any]:
+        if not rdkit:
+            require_package('molecular', 'MolecularDataLoader')
+
+        ext = file_path.suffix.lower()
+
+        if ext == '.sdf':
+            mols = list(Chem.SDMolSupplier(str(file_path)))
+        elif ext == '.mol2':
+            mols = [Chem.MolFromMol2File(str(file_path))]
+        elif ext == '.pdb':
+            mols = [Chem.MolFromPDBFile(str(file_path))]
+        else:
+            raise ValueError(f"Unsupported format: {ext}")
+
+        # Convert molecules to features
+        features = []
+        for mol in mols:
+            if mol is not None:
+                feat = self._extract_features(mol)
+                features.append(feat)
+
+        features_array = np.array(features)
+
+        # Look for properties file with target values
+        props_path = file_path.with_suffix('.csv')
+        if props_path.exists():
+            props_df = pd.read_csv(props_path)
+            targets = props_df[self.get_keys(schema, "target")].values
+        else:
+            targets = None
+
+        return features_array, targets
+
+    def _extract_features(self, mol) -> Dict:
+        """Extract relevant molecular features"""
+        return {
+            'morgan_fp': AllChem.GetMorganFingerprintAsBitVect(mol, 2),
+            'num_atoms': mol.GetNumAtoms(),
+            'num_bonds': mol.GetNumBonds(),
+            'molecular_weight': Descriptors.ExactMolWt(mol),
+            'logp': Descriptors.MolLogP(mol),
+            'polar_surface_area': Descriptors.TPSA(mol)
+        }
\ No newline at end of file
diff --git a/foundry/loaders/numpy.py b/foundry/loaders/numpy.py
new file mode 100644
index 00000000..5098e2d4
--- /dev/null
+++ b/foundry/loaders/numpy.py
@@ -0,0 +1,26 @@
+import numpy as np
+from pathlib import Path
+from typing import Tuple, Any
+
+from .base import DataLoader
+
+class NumpyDataLoader(DataLoader):
+    """Loader for NumPy array formats (.npy, .npz)"""
+
+    def supports_format(self, file_path: Path) -> bool:
+        return file_path.suffix.lower() in {'.npy', '.npz'}
+
+    def load(self, file_path: Path, schema, split=None) -> Tuple[Any, Any]:
+        if file_path.suffix.lower() == '.npz':
+            with np.load(file_path) as data:
+                input_keys = self.get_keys(schema, "input")
+                target_keys = self.get_keys(schema, "target")
+                return (
+                    {k: data[k] for k in input_keys},
+                    {k: data[k] for k in target_keys}
+                )
+        else:
+            # For .npy files, assume single array split between input/target
+            data = np.load(file_path)
+            n_inputs = len(self.get_keys(schema, "input"))
+            return data[:, :n_inputs], data[:, n_inputs:]
\ No newline at end of file
diff --git a/foundry/loaders/parquet.py b/foundry/loaders/parquet.py
new file mode 100644
index 00000000..906a0dd8
--- /dev/null
+++ b/foundry/loaders/parquet.py
@@ -0,0 +1,13 @@
+import pyarrow.parquet as pq
+from .base import DataLoader
+
+class ParquetDataLoader(DataLoader):
+    def supports_format(self, file_path):
+        return file_path.suffix.lower() == '.parquet'
+
+    def load(self, file_path, schema, split=None):
+        df = pq.read_table(file_path).to_pandas()
+        return (
+            df[self.get_keys(schema, "input")],
+            df[self.get_keys(schema, "target")]
+        )
\ No newline at end of file
diff --git a/foundry/loaders/registry.py b/foundry/loaders/registry.py
new file mode 100644
index 00000000..be482e77
--- /dev/null
+++ b/foundry/loaders/registry.py
@@ -0,0 +1,48 @@
+from pathlib import Path
+from typing import Dict, Type
+
+from ..utils import optional_import
+
+from .base import DataLoader
+from .tabular import TabularDataLoader
+from .hdf5 import HDF5DataLoader
+from .numpy import NumpyDataLoader
+from .image import ImageDataLoader
+
+class LoaderRegistry:
+    """Registry for managing available data loaders
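+
+    Resolution order: an exact ``data_type`` match first, then the first
+    registered loader whose ``supports_format`` accepts the file. A usage
+    sketch mirroring FoundryCache._load_data:
+
+        registry = LoaderRegistry()
+        loader = registry.get_loader(path, foundry_schema.data_type, cache_dir)
+        inputs, targets = loader.load(path, foundry_schema)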
+    """
+
+    def __init__(self):
+        self._loaders: Dict[str, Type[DataLoader]] = {}
+
+        # Always register core loaders
+        self.register_loader("tabular", TabularDataLoader)
+        self.register_loader("hdf5", HDF5DataLoader)
+        self.register_loader("numpy", NumpyDataLoader)
+        self.register_loader("image", ImageDataLoader)
+
+        # Conditionally register optional loaders
+        if optional_import('rdkit'):
+            from .molecular import MolecularDataLoader
+            self.register_loader("molecular", MolecularDataLoader)
+
+        if optional_import('jcamp'):
+            from .spectral import SpectralDataLoader
+            self.register_loader("spectral", SpectralDataLoader)
+
+    def register_loader(self, name: str, loader_class: Type[DataLoader]):
+        """Register a new loader class"""
+        self._loaders[name] = loader_class
+
+    def get_loader(self, file_path: Path, schema_type: str, cache_dir: str) -> DataLoader:
+        """Get appropriate loader instance for file"""
+        if schema_type in self._loaders:
+            return self._loaders[schema_type](cache_dir)
+
+        # Try to find loader that supports the format
+        for loader_class in self._loaders.values():
+            loader = loader_class(cache_dir)
+            if loader.supports_format(file_path):
+                return loader
+
+        raise ValueError(f"No suitable loader found for {file_path}")
\ No newline at end of file
diff --git a/foundry/loaders/spectral.py b/foundry/loaders/spectral.py
new file mode 100644
index 00000000..41f594d3
--- /dev/null
+++ b/foundry/loaders/spectral.py
@@ -0,0 +1,61 @@
+import numpy as np
+from pathlib import Path
+from typing import Tuple, Any
+import pandas as pd
+from scipy.io import loadmat
+
+from .base import DataLoader
+
+class SpectralDataLoader(DataLoader):
+    """Loader for spectroscopic data formats"""
+
+    SUPPORTED_EXTENSIONS = {'.txt', '.csv', '.mat', '.jdx', '.dx'}
+
+    def supports_format(self, file_path: Path) -> bool:
+        return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS
+
+    def load(self, file_path: Path, schema, split=None) -> Tuple[Any, Any]:
+        ext = file_path.suffix.lower()
+
+        if ext in {'.txt', '.csv'}:
+            # Assume standard format with wavelength/frequency in first column
+            data = pd.read_csv(file_path, delimiter=None, engine='python')
+            x_values = data.iloc[:, 0].values
+            spectra = data.iloc[:, 1:].values
+
+        elif ext == '.mat':
+            # MATLAB format
+            data = loadmat(file_path)
+            # Assume standard variable names, could be made configurable
+            x_values = data.get('wavelength', data.get('frequency', None))
+            spectra = data.get('spectra', None)
+
+        elif ext in {'.jdx', '.dx'}:
+            # JCAMP-DX format
+            try:
+                import jcamp  # Optional dependency
+                data = jcamp.JCAMP_reader(str(file_path))
+                x_values = data['x']
+                spectra = data['y']
+            except ImportError:
+                raise ImportError("jcamp package required for .jdx/.dx files; install with: pip install foundry_ml[spectral]")
+
+        else:
+            raise ValueError(f"Unsupported format: {ext}")
+
+        # Package the spectral data
+        input_data = {
+            'x_values': x_values,
+            'spectra': spectra
+        }
+
+        # Look for metadata/target values
+        meta_path = file_path.with_suffix('.json')
+        if meta_path.exists():
+            import json
+            with open(meta_path) as f:
+                target_data = json.load(f)
+        else:
+            target_data = None
+
+        return input_data, target_data
\ No newline at end of file
diff --git a/foundry/loaders/tabular.py b/foundry/loaders/tabular.py
new file mode 100644
index 00000000..70fb5225
--- /dev/null
+++ b/foundry/loaders/tabular.py
@@ -0,0 +1,56 @@
+import pandas as pd
+from pathlib import Path
+from typing import Tuple, Any
+
+from .base import DataLoader
+from foundry.utils import _read_csv, _read_json, _read_excel
+
+class TabularDataLoader(DataLoader):
+    """Loader for tabular data formats (CSV, JSON, Excel)
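+
+    Splits a table into (inputs, targets) DataFrames using the schema's
+    'input' and 'target'/'output' keys. A sketch (column names illustrative):
+
+        loader = TabularDataLoader('./data')
+        if loader.supports_format(Path('data.csv')):
+            inputs, targets = loader.load(Path('data.csv'), schema)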
+    """
+
+    SUPPORTED_EXTENSIONS = {'.csv', '.json', '.xlsx', '.xls'}
+    VALID_KEY_TYPES = {'input', 'target', 'output'}
+
+    def supports_format(self, file_path: Path) -> bool:
+        return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS
+
+    def load(self, file_path: Path, schema, split=None, as_hdf5: bool = False) -> Tuple[Any, Any]:
+        """Load tabular data and split into input/target
+
+        Args:
+            file_path: Path to data file
+            schema: Schema describing the data format
+            split: Optional split information
+            as_hdf5: Ignored for tabular data
+
+        Returns:
+            Tuple of (input_data, target_data)
+        """
+        # Validate schema
+        if not schema.keys:
+            raise ValueError("No keys defined in schema")
+
+        # Validate key types
+        for key in schema.keys:
+            if key.type not in self.VALID_KEY_TYPES:
+                raise ValueError(f"Invalid key type: {key.type}. Must be one of {self.VALID_KEY_TYPES}")
+
+        # Load the data based on file type
+        ext = file_path.suffix.lower()
+        if ext == '.csv':
+            df = _read_csv(str(file_path))
+        elif ext == '.json':
+            df = _read_json(str(file_path))
+        elif ext in {'.xlsx', '.xls'}:
+            df = _read_excel(str(file_path))
+        else:
+            raise ValueError(f"Unsupported format: {ext}")
+
+        # Split into input and target features
+        input_cols = self.get_keys(schema, "input")
+        target_cols = self.get_keys(schema, "target") or self.get_keys(schema, "output")
+
+        if not input_cols and not target_cols:
+            raise ValueError("No input or target keys found in schema")
+
+        return df[input_cols], df[target_cols]
\ No newline at end of file
diff --git a/foundry/loaders/tf_wrapper.py b/foundry/loaders/tf_wrapper.py
index 13c5cc2b..74ed7b99 100644
--- a/foundry/loaders/tf_wrapper.py
+++ b/foundry/loaders/tf_wrapper.py
@@ -1,26 +1,17 @@
-import numpy as np
-from tensorflow.keras.utils import Sequence
+from ..utils import optional_import, require_package
 
+tf = optional_import('tensorflow')
 
-class TensorflowSequence(Sequence):
-    """Foundry Dataset Converted to Tensorflow Format"""
-
+class TensorflowSequence:
     def __init__(self, inputs, targets):
+        if not tf:
+            require_package('tensorflow', 'TensorflowSequence')
+
         self.inputs = inputs
         self.targets = targets
-
-    def __len__(self):
-        return len(self.inputs[0])
-
+
     def __getitem__(self, idx):
-        item = {"input": [], "target": []}
-
-        for input in self.inputs:
-            item["input"].append(np.array(input[idx]))
-        item["input"] = np.array(item["input"])
-
-        for target in self.targets:
-            item["target"].append(np.array(target[idx]))
-        item["target"] = np.array(item["target"])
-
-        return item
+        return self.inputs[idx], self.targets[idx]
+
+    def __len__(self):
+        return len(self.inputs)
diff --git a/foundry/loaders/torch_wrapper.py b/foundry/loaders/torch_wrapper.py
index 6193c098..ecac3fac 100644
--- a/foundry/loaders/torch_wrapper.py
+++ b/foundry/loaders/torch_wrapper.py
@@ -1,28 +1,17 @@
-import numpy as np
-from torch.utils.data import Dataset
+from ..utils import optional_import, require_package
 
+torch = optional_import('torch')
 
-class TorchDataset(Dataset):
-    """Foundry Dataset Converted to Pytorch Format"""
-
+class TorchDataset:
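+    """Foundry data in a map-style, PyTorch-compatible format.
+
+    Implements ``__getitem__``/``__len__``, so it can be passed to
+    ``torch.utils.data.DataLoader``. A sketch (assumes torch is installed and
+    inputs/targets are equal-length indexables):
+
+        ds = TorchDataset(inputs, targets)
+        x, y = ds[0]
+    """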
     def __init__(self, inputs, targets):
+        if not torch:
+            require_package('torch', 'TorchDataset')
+
         self.inputs = inputs
         self.targets = targets
-
-    def __len__(self):
-        return len(self.inputs[0])
-
+
     def __getitem__(self, idx):
-        item = {"input": [], "target": []}
-
-        # adds the correct item at index idx from each input from self.inputs to the item dictionary
-        for input in self.inputs:
-            item["input"].append(np.array(input[idx]))
-        item["input"] = np.array(item["input"])
-
-        # adds the correct item at index idx from each target from self.targets to the item dictionary
-        for target in self.targets:
-            item["target"].append(np.array(target[idx]))
-        item["target"] = np.array(item["target"])
-
-        return item
+        return self.inputs[idx], self.targets[idx]
+
+    def __len__(self):
+        return len(self.inputs)
diff --git a/foundry/utils/__init__.py b/foundry/utils/__init__.py
new file mode 100644
index 00000000..167c5984
--- /dev/null
+++ b/foundry/utils/__init__.py
@@ -0,0 +1,15 @@
+# Re-export shared helpers; the implementations live in the submodules
+# (avoids duplicating optional_import/require_package from .imports)
+from .io import _read_csv, _read_json, _read_excel
+from .validation import is_doi, is_pandas_pytable
+from .imports import optional_import, require_package
+
+__all__ = [
+    'optional_import',
+    'require_package',
+    '_read_csv',
+    '_read_json',
+    '_read_excel',
+    'is_doi',
+    'is_pandas_pytable'
+]
\ No newline at end of file
diff --git a/foundry/utils/imports.py b/foundry/utils/imports.py
new file mode 100644
index 00000000..02df2f6c
--- /dev/null
+++ b/foundry/utils/imports.py
@@ -0,0 +1,26 @@
+from importlib import import_module
+from typing import Optional, Any
+
+def optional_import(module_name: str, package_name: Optional[str] = None) -> Any:
+    """
+    Attempt to import an optional module.
+
+    Args:
+        module_name: Name of the module to import
+        package_name: Name of the package for error message (if different from module_name)
+
+    Returns:
+        The imported module if successful, None otherwise
+    """
+    try:
+        return import_module(module_name)
+    except ImportError:
+        return None
+
+def require_package(package_name: str, feature_name: str):
+    """Raise informative error when an optional package is required but not installed"""
+    raise ImportError(
+        f"{feature_name} requires {package_name}. "
+        f"Install with: pip install foundry_ml[{package_name}] "
+        f"or pip install {package_name}"
+    )
\ No newline at end of file
diff --git a/foundry/utils/io.py b/foundry/utils/io.py
new file mode 100644
index 00000000..4fbc8e19
--- /dev/null
+++ b/foundry/utils/io.py
@@ -0,0 +1,17 @@
+import pandas as pd
+import json
+from pathlib import Path
+
+def _read_csv(path_to_file: str, **kwargs):
+    """Read a CSV file into a pandas DataFrame"""
+    return pd.read_csv(path_to_file, **kwargs)
+
+def _read_json(path_to_file: str, lines: bool = False, **kwargs):
+    """Read a JSON file into a pandas DataFrame"""
+    if lines:
+        return pd.read_json(path_to_file, lines=True, **kwargs)
+    return pd.read_json(path_to_file, **kwargs)
+
+def _read_excel(path_to_file: str, **kwargs):
+    """Read an Excel file into a pandas DataFrame"""
+    return pd.read_excel(path_to_file, **kwargs)
\ No newline at end of file
diff --git a/foundry/utils/validation.py b/foundry/utils/validation.py
new file mode 100644
index 00000000..954ada3a
--- /dev/null
+++ b/foundry/utils/validation.py
@@ -0,0 +1,11 @@
+import re
+from typing import Optional
+
+def is_doi(identifier: str) -> bool:
+    """Check if a string is a DOI"""
+    doi_pattern = r'10\.\d{4,9}/[-._;()/:\w]+'
+    return bool(re.match(doi_pattern, identifier))
+
+def is_pandas_pytable(h5_group) -> bool:
+    """Check if an HDF5 group was written by pandas (PyTables format)"""
+    return 'pandas_type' in getattr(h5_group, 'attrs', {})
\ No newline at end of file
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..aff89247
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+addopts = --cov=foundry --cov-report=term-missing
+testpaths = tests
+python_files = test_*.py
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 00000000..e17eb84b
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,13 @@
+# Include all dependencies for development
+-e .[all]
+
+# Testing dependencies
+pytest>=7.0.0
+pytest-cov>=3.0.0
+coverage>=7.0.0
+flake8>=4.0.0
+mypy>=0.950
+
+# Documentation
+sphinx>=4.0.0
+sphinx-rtd-theme>=1.0.0
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 082c994a..a53274e6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,17 +1,16 @@
-globus-sdk>=3,<4
-dlhub_sdk>=2.1.0
-requests>=2.18.4
-tqdm>=4.19.4
-six>=1.11.0
-h5py>=2.10.0
+# Core dependencies
 numpy>=1.15.4
 pandas>=0.23.4
-scikit-learn>=1.0
-pydantic>=2.7.2
+h5py>=2.10.0
+pillow>=9.0.0
+scipy>=1.7.0
 mdf_forge>=0.8.0
-mdf-connect-client>=0.5.0
+globus-sdk>=3,<4
+pydantic>=2.7.2
+mdf_connect_client>=0.5.0
 json2table>=1.1.5
-torch>=1.8.0
-tensorflow>=2
-tqdm>=4.64
 openpyxl>=3.1.0
+tqdm>=4.19.4
+
+# Optional dependencies are installed via extras:
+# pip install foundry_ml[molecular,torch,tensorflow,spectral]
diff --git a/scripts/test_with_coverage.sh b/scripts/test_with_coverage.sh
new file mode 100755
index 00000000..2aef0623
--- /dev/null
+++ b/scripts/test_with_coverage.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+pytest --cov=foundry --cov-report=html --cov-report=term-missing "$@"
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 1783614e..ac75ea50 100644
--- a/setup.py
+++ b/setup.py
@@ -1,30 +1,48 @@
-import setuptools
+from setuptools import setup, find_packages
 
 with open("README.md", "r") as fh:
     long_description = fh.read()
-packages = (setuptools.find_packages(),)
-setuptools.setup(
+
+# Core dependencies required for basic functionality
+CORE_REQUIREMENTS = [
+    "numpy>=1.15.4",
+    "pandas>=0.23.4",
+    "h5py>=2.10.0",
+    "pillow>=9.0.0",  # For image loading
+    "scipy>=1.7.0",
+    "mdf_forge>=0.8.0",
+    "globus-sdk>=3,<4",
+    "pydantic>=2.7.2",
+    "mdf_connect_client>=0.5.0",
+    "json2table>=1.1.5",
+    "openpyxl>=3.1.0",  # For Excel support
+    "tqdm>=4.19.4",  # For progress bars
+]
+
+# Optional dependencies for specific features
+EXTRAS_REQUIRE = {
+    'molecular': ["rdkit>=2022.9.1"],
+    'torch': ["torch>=1.8.0"],
+    'tensorflow': ["tensorflow>=2.0.0"],
+    'spectral': ["jcamp>=1.0.0"],
+    'all': ["rdkit>=2022.9.1",
+            "torch>=1.8.0",
+            "tensorflow>=2.0.0",
+            "jcamp>=1.0.0"]
+}
+
+setup(
     name="foundry_ml",
     version="1.0.4",
     author="""Aristana Scourtas, KJ Schmidt, Isaac Darling, Aadit Ambadkar, Braeden Cullen,
               Imogen Foster, Ribhav Bose, Zoa Katok, Ethan Truelove, Ian Foster, Ben Blaiszik""",
     author_email="blaiszik@uchicago.edu",
-    packages=setuptools.find_packages(),
+    packages=find_packages(),
     description="Package to support simplified application of machine learning models to datasets in materials science",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    install_requires=[
-        "mdf_forge>=0.8.0",
-        "globus-sdk>=3,<4",
-        "dlhub_sdk>=1.0.0",
-        "numpy>=1.15.4",
-        "pandas>=0.23.4",
-        "pydantic>=2.7.2",
-        "mdf_connect_client>=0.5.0",
-        "h5py>=2.10.0",
-        "json2table",
-        "openpyxl>=3.1.0"
-    ],
+    install_requires=CORE_REQUIREMENTS,
+    extras_require=EXTRAS_REQUIRE,
     python_requires=">=3.7",
     classifiers=[
         "Development Status :: 3 - Alpha",
@@ -35,7 +53,7 @@
         "Programming Language :: Python :: 3",
         "Topic :: Scientific/Engineering",
     ],
-    keywords=[],
+    keywords=["materials science", "machine learning", "data science"],
     license="MIT License",
     url="https://github.com/MLMI2-CSSI/foundry",
 )
diff --git a/test-requirements.txt b/test-requirements.txt
index bbb61052..dac7a95d 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -3,4 +3,8 @@ pytest-cov>=2.12
 pytest-mock
 flake8
 jsonschema
-mock
+mdf_toolbox
+rdkit
+pandas
+numpy
+h5py
+++ b/tests/test_foundry.py @@ -10,7 +10,7 @@ from filecmp import cmp from globus_sdk import AuthClient from mdf_connect_client import MDFConnectClient -import mock +from unittest import mock import json import builtins @@ -42,6 +42,7 @@ ] if is_gha: + print("is_gha") auths = mdf_toolbox.confidential_login(client_id=client_id, client_secret=client_secret, services=services, make_clients=True) @@ -49,9 +50,12 @@ search_auth = mdf_toolbox.confidential_login(client_id=client_id, client_secret=client_secret, services=["search"], make_clients=False) + print(search_auth) else: + print("not gha") auths = mdf_toolbox.login(services=services, make_clients=True) search_auth = mdf_toolbox.login(services=["search"], make_clients=False) + print(search_auth) auths['search_authorizer'] = search_auth['search'] diff --git a/tests/test_foundry_components.py b/tests/test_foundry_components.py index 8b0669de..d24c7f81 100644 --- a/tests/test_foundry_components.py +++ b/tests/test_foundry_components.py @@ -30,6 +30,7 @@ def auths(): ] if is_gha: + print("is_gha") auths = mdf_toolbox.confidential_login(client_id=client_id, client_secret=client_secret, services=services, make_clients=True) @@ -37,6 +38,7 @@ def auths(): search_auth = mdf_toolbox.confidential_login(client_id=client_id, client_secret=client_secret, services=["search"], make_clients=False) + print(search_auth) else: auths = mdf_toolbox.login(services=services, make_clients=True) search_auth = mdf_toolbox.login(services=["search"], make_clients=False) diff --git a/tests/test_https_download.py b/tests/test_https_download.py index f992fe0e..c2cdcc13 100644 --- a/tests/test_https_download.py +++ b/tests/test_https_download.py @@ -1,67 +1,58 @@ +# import pytest +# from unittest.mock import patch, MagicMock +# from pathlib import Path # import os -# import requests -# import mock - -# from foundry.https_download import download_file +# from foundry.https_download import download_file, recursive_ls # def test_download_file(tmp_path): +# # Create source directory +# source_dir = tmp_path / "test_id" +# source_dir.mkdir() + # item = { -# "path": tmp_path, -# "name": "example_file.txt" -# } -# data_directory = tmp_path -# https_config = { -# "base_url": "https://example.com/", -# "source_id": "12345" -# } - -# # Mock the requests.get function to return a response with content -# with mock.patch.object(requests, "get") as mock_get: -# mock_get.return_value.content = b"Example file content" - -# # Call the function -# result = download_file(item, data_directory, https_config) - -# # Assert that the file was downloaded and written correctly -# assert os.path.exists(str(tmp_path) + "/12345/example_file.txt") -# with open(str(tmp_path) + "/12345/example_file.txt", "rb") as f: -# assert f.read() == b"Example file content" - -# # Assert that the result is as expected -# assert result == {str(tmp_path) + "/12345/example_file.txt status": True} - - -# def test_download_file_with_existing_directories(tmp_path): -# temp_path_to_file = str(tmp_path) + '/file' -# os.mkdir(temp_path_to_file) -# temp_path_to_data = str(tmp_path) + '/data' -# os.mkdir(temp_path_to_data) - -# item = { -# "path": temp_path_to_file, -# "name": "example_file.txt" +# "path": str(source_dir), +# "name": "test.txt" # } -# data_directory = temp_path_to_data # https_config = { # "base_url": "https://example.com/", -# "source_id": "12345" +# "source_id": "test_id" # } - -# # Create the parent directories -# os.makedirs(temp_path_to_data + "12345") - -# # Mock the requests.get function to return a 
diff --git a/tests/test_https_download.py b/tests/test_https_download.py
index f992fe0e..c2cdcc13 100644
--- a/tests/test_https_download.py
+++ b/tests/test_https_download.py
@@ -1,67 +1,58 @@
+# import pytest
+# from unittest.mock import patch, MagicMock
+# from pathlib import Path
 # import os
-# import requests
-# import mock
-
-# from foundry.https_download import download_file
+# from foundry.https_download import download_file, recursive_ls
 
 # def test_download_file(tmp_path):
+#     # Create source directory
+#     source_dir = tmp_path / "test_id"
+#     source_dir.mkdir()
+
 #     item = {
-#         "path": tmp_path,
-#         "name": "example_file.txt"
-#     }
-#     data_directory = tmp_path
-#     https_config = {
-#         "base_url": "https://example.com/",
-#         "source_id": "12345"
-#     }
-
-#     # Mock the requests.get function to return a response with content
-#     with mock.patch.object(requests, "get") as mock_get:
-#         mock_get.return_value.content = b"Example file content"
-
-#         # Call the function
-#         result = download_file(item, data_directory, https_config)
-
-#     # Assert that the file was downloaded and written correctly
-#     assert os.path.exists(str(tmp_path) + "/12345/example_file.txt")
-#     with open(str(tmp_path) + "/12345/example_file.txt", "rb") as f:
-#         assert f.read() == b"Example file content"
-
-#     # Assert that the result is as expected
-#     assert result == {str(tmp_path) + "/12345/example_file.txt status": True}
-
-
-# def test_download_file_with_existing_directories(tmp_path):
-#     temp_path_to_file = str(tmp_path) + '/file'
-#     os.mkdir(temp_path_to_file)
-#     temp_path_to_data = str(tmp_path) + '/data'
-#     os.mkdir(temp_path_to_data)
-
-#     item = {
-#         "path": temp_path_to_file,
-#         "name": "example_file.txt"
+#         "path": str(source_dir),
+#         "name": "test.txt"
 #     }
-#     data_directory = temp_path_to_data
 #     https_config = {
 #         "base_url": "https://example.com/",
-#         "source_id": "12345"
+#         "source_id": "test_id"
 #     }
-
-#     # Create the parent directories
-#     os.makedirs(temp_path_to_data + "12345")
-
-#     # Mock the requests.get function to return a response with content
-#     with mock.patch.object(requests, "get") as mock_get:
-#         mock_get.return_value.content = b"Example file content"
-
-#         # Call the function
-#         result = download_file(item, data_directory, https_config)
-
-#     # Assert that the file was downloaded and written correctly
-#     assert os.path.exists(temp_path_to_data + "/12345/example_file.txt")
-#     with open(temp_path_to_data + "/12345/example_file.txt", "rb") as f:
-#         assert f.read() == b"Example file content"
-
-#     # Assert that the result is as expected
-#     assert result == {temp_path_to_data + "/12345/example_file.txt status": True}
+
+#     mock_response = MagicMock()
+#     mock_response.content = b"test content"
+
+#     with patch('requests.get', return_value=mock_response):
+#         result = download_file(item, str(tmp_path), https_config)
+
+#     # Check the file was created by download_file; only read it back here,
+#     # since re-writing it before the assertion would make the check vacuous
+#     expected_path = source_dir / "test.txt"
+#     assert expected_path.exists()
+#     assert expected_path.read_bytes() == b"test content"
+
+#     # Check return value
+#     assert result == {str(expected_path) + " status": True}
+
+# def test_recursive_ls(tmp_path):
+#     # Create test directory structure
+#     (tmp_path / "dir1").mkdir()
+#     file1 = tmp_path / "dir1/file1.txt"
+#     file1.touch()
+#     (tmp_path / "dir2").mkdir()
+#     file2 = tmp_path / "dir2/file2.txt"
+#     file2.touch()
+
+#     # Create mock endpoint
+#     mock_ep = MagicMock()
+#     mock_ep.ls.return_value = [
+#         {"name": "file1.txt", "path": str(file1)},
+#         {"name": "file2.txt", "path": str(file2)}
+#     ]
+
+#     files = recursive_ls(mock_ep, str(tmp_path), str(tmp_path))
+#     assert len(files) == 2
+#     file_names = {f["name"] for f in files}
+#     assert "file1.txt" in file_names
+#     assert "file2.txt" in file_names
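Although commented out, these tests pin down an implied contract for `download_file`: fetch the content over HTTPS, write it to `<data_directory>/<source_id>/<name>`, and return `{path + " status": True}`. For orientation, a sketch consistent with those assertions — the URL construction is a guess, and this is not the actual `foundry.https_download` code:

```python
import os
import requests

def download_file(item, data_directory, https_config):
    """Sketch of the behaviour the tests above assert; not foundry's implementation.

    Only the on-disk path and the return value are pinned down by the tests;
    the URL scheme below is an assumption.
    """
    url = https_config["base_url"] + item["name"]  # assumed URL layout
    destination = os.path.join(str(data_directory),
                               https_config["source_id"], item["name"])
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    response = requests.get(url)
    with open(destination, "wb") as f:
        f.write(response.content)
    return {destination + " status": True}
```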
diff --git a/tests/test_loaders.py b/tests/test_loaders.py
new file mode 100644
index 00000000..e810d4e5
--- /dev/null
+++ b/tests/test_loaders.py
@@ -0,0 +1,166 @@
+import pytest
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import h5py
+import tempfile
+import os
+
+from foundry.loaders.base import DataLoader
+from foundry.loaders.tabular import TabularDataLoader
+from foundry.loaders.hdf5 import HDF5DataLoader
+from foundry.loaders.registry import LoaderRegistry
+from foundry.models import FoundrySchema
+
+@pytest.fixture
+def sample_schema():
+    """Create a sample schema for testing"""
+    return FoundrySchema({
+        'data_type': 'tabular',
+        'keys': [
+            {'key': ['x1', 'x2'], 'type': 'input'},
+            {'key': ['y'], 'type': 'target'}
+        ]
+    })
+
+@pytest.fixture
+def sample_data():
+    """Create sample data for testing"""
+    return pd.DataFrame({
+        'x1': [1, 2, 3],
+        'x2': [4, 5, 6],
+        'y': [7, 8, 9]
+    })
+
+class TestTabularLoader:
+    @pytest.fixture
+    def loader(self):
+        return TabularDataLoader('./data')
+
+    def test_supports_format(self, loader):
+        assert loader.supports_format(Path('data.csv'))
+        assert loader.supports_format(Path('data.json'))
+        assert loader.supports_format(Path('data.xlsx'))
+        assert not loader.supports_format(Path('data.h5'))
+
+    def test_load_csv(self, loader, sample_schema, sample_data, tmp_path):
+        # Save sample data
+        csv_path = tmp_path / 'test.csv'
+        sample_data.to_csv(csv_path, index=False)
+
+        # Load and verify
+        inputs, targets = loader.load(csv_path, sample_schema)
+        assert np.array_equal(inputs['x1'], sample_data['x1'])
+        assert np.array_equal(inputs['x2'], sample_data['x2'])
+        assert np.array_equal(targets['y'], sample_data['y'])
+
+    def test_load_json(self, loader, sample_schema, sample_data, tmp_path):
+        # Save sample data
+        json_path = tmp_path / 'test.json'
+        sample_data.to_json(json_path)
+
+        # Load and verify
+        inputs, targets = loader.load(json_path, sample_schema)
+        assert np.array_equal(inputs['x1'], sample_data['x1'])
+        assert np.array_equal(inputs['x2'], sample_data['x2'])
+        assert np.array_equal(targets['y'], sample_data['y'])
+
+    def test_invalid_format(self, loader, sample_schema):
+        with pytest.raises(ValueError):
+            loader.load(Path('nonexistent.txt'), sample_schema)
+
+class TestHDF5Loader:
+    @pytest.fixture
+    def loader(self):
+        return HDF5DataLoader('./data')
+
+    def test_supports_format(self, loader):
+        assert loader.supports_format(Path('data.h5'))
+        assert loader.supports_format(Path('data.hdf5'))
+        assert not loader.supports_format(Path('data.csv'))
+
+    def test_load_hdf5(self, loader, sample_schema, tmp_path):
+        # Create sample HDF5 file
+        h5_path = tmp_path / 'test.h5'
+        with h5py.File(h5_path, 'w') as f:
+            f.create_dataset('x1', data=[1, 2, 3])
+            f.create_dataset('x2', data=[4, 5, 6])
+            f.create_dataset('y', data=[7, 8, 9])
+
+        # Test normal loading
+        inputs, targets = loader.load(h5_path, sample_schema)
+        assert np.array_equal(inputs['x1'], [1, 2, 3])
+        assert np.array_equal(inputs['x2'], [4, 5, 6])
+        assert np.array_equal(targets['y'], [7, 8, 9])
+
+    def test_load_hdf5_as_hdf5(self, loader, sample_schema, tmp_path):
+        # Create sample HDF5 file
+        h5_path = tmp_path / 'test.h5'
+        with h5py.File(h5_path, 'w') as f:
+            f.create_dataset('x1', data=[1, 2, 3])
+            f.create_dataset('x2', data=[4, 5, 6])
+            f.create_dataset('y', data=[7, 8, 9])
+
+        # Test loading with as_hdf5=True
+        inputs, targets = loader.load(h5_path, sample_schema, as_hdf5=True)
+        assert isinstance(inputs['x1'], h5py.Dataset)
+        assert isinstance(inputs['x2'], h5py.Dataset)
+        assert isinstance(targets['y'], h5py.Dataset)
+
+class TestLoaderRegistry:
+    @pytest.fixture
+    def registry(self):
+        return LoaderRegistry()
+
+    def test_get_loader_by_type(self, registry):
+        loader = registry.get_loader(Path('data.csv'), 'tabular', './data')
+        assert isinstance(loader, TabularDataLoader)
+
+    def test_get_loader_by_extension(self, registry):
+        loader = registry.get_loader(Path('data.h5'), None, './data')
+        assert isinstance(loader, HDF5DataLoader)
+
+    def test_no_suitable_loader(self, registry):
+        with pytest.raises(ValueError):
+            registry.get_loader(Path('data.unknown'), None, './data')
+
+    def test_register_custom_loader(self, registry):
+        class CustomLoader(DataLoader):
+            def supports_format(self, file_path):
+                return file_path.suffix == '.custom'
+            def load(self, file_path, schema, split=None, as_hdf5=False):
+                return None, None
+
+        registry.register_loader('custom', CustomLoader)
+        loader = registry.get_loader(Path('data.custom'), 'custom', './data')
+        assert isinstance(loader, CustomLoader)
+
+@pytest.mark.integration
+class TestIntegration:
+    def test_end_to_end_tabular(self, tmp_path):
+        # Create test data
+        df = pd.DataFrame({
+            'x1': [1, 2, 3],
+            'x2': [4, 5, 6],
+            'y': [7, 8, 9]
+        })
+        csv_path = tmp_path / 'test.csv'
+        df.to_csv(csv_path, index=False)
+
+        # Create schema
+        schema = FoundrySchema({
+            'data_type': 'tabular',
+            'keys': [
+                {'key': ['x1', 'x2'], 'type': 'input'},
+                {'key': ['y'], 'type': 'target'}
+            ]
+        })
+
+        # Test loading through registry
+        registry = LoaderRegistry()
+        loader = registry.get_loader(csv_path, schema.data_type, str(tmp_path))
+        inputs, targets = loader.load(csv_path, schema)
+
+        assert np.array_equal(inputs['x1'], df['x1'])
+        assert np.array_equal(inputs['x2'], df['x2'])
+        assert np.array_equal(targets['y'], df['y'])
\ No newline at end of file
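Taken together, these tests define the loader interface: a `DataLoader` subclass advertises which file formats it handles and returns `(inputs, targets)` dictionaries keyed by the schema, while the registry resolves a loader from an explicit `data_type` or, failing that, from the file extension. A minimal sketch of that interface as implied by the tests — not the actual `foundry.loaders` source:

```python
from pathlib import Path

class DataLoader:
    """Base interface implied by the tests above."""
    def __init__(self, cache_dir):
        self.cache_dir = cache_dir

    def supports_format(self, file_path):
        raise NotImplementedError

    def load(self, file_path, schema, split=None, as_hdf5=False):
        raise NotImplementedError

class LoaderRegistry:
    def __init__(self):
        # The real registry presumably pre-registers the built-in loaders
        # (tabular, hdf5, ...); this sketch starts empty.
        self._loaders = {}

    def register_loader(self, data_type, loader_cls):
        self._loaders[data_type] = loader_cls

    def get_loader(self, file_path, data_type, cache_dir):
        # 1. Resolve by explicit data_type when the schema provides one.
        if data_type in self._loaders:
            return self._loaders[data_type](cache_dir)
        # 2. Otherwise fall back to sniffing the file extension.
        for loader_cls in self._loaders.values():
            loader = loader_cls(cache_dir)
            if loader.supports_format(Path(file_path)):
                return loader
        raise ValueError(f"No suitable loader found for {file_path}")
```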
diff --git a/tests/test_loaders_errors.py b/tests/test_loaders_errors.py
new file mode 100644
index 00000000..a0573b7f
--- /dev/null
+++ b/tests/test_loaders_errors.py
@@ -0,0 +1,81 @@
+import pytest
+from pathlib import Path
+import pandas as pd
+
+from foundry.loaders.base import DataLoader
+from foundry.loaders.registry import LoaderRegistry
+from foundry.models import FoundrySchema
+
+def test_loader_not_found():
+    registry = LoaderRegistry()
+    with pytest.raises(ValueError, match="No suitable loader found"):
+        registry.get_loader(Path('test.foo'), None, './data')
+
+def test_missing_file():
+    class TestLoader(DataLoader):
+        def __init__(self, cache_dir):
+            super().__init__(cache_dir)
+
+        def supports_format(self, file_path):
+            return True
+
+        def load(self, file_path, schema, split=None, as_hdf5=False):
+            if not file_path.exists():
+                raise FileNotFoundError(f"No file found at: {file_path}")
+            return None, None
+
+    loader = TestLoader('./data')
+    with pytest.raises(FileNotFoundError):
+        loader.load(Path('nonexistent.txt'), FoundrySchema({}))
+
+@pytest.fixture
+def test_csv(tmp_path):
+    # Create test CSV file
+    csv_path = tmp_path / 'test.csv'
+    df = pd.DataFrame({
+        'col1': [1, 2],
+        'col2': [3, 4]
+    })
+    df.to_csv(csv_path, index=False)
+    return csv_path
+
+def test_missing_schema_keys(test_csv):
+    registry = LoaderRegistry()
+    schema = FoundrySchema({
+        'data_type': 'tabular',
+        'keys': []  # Empty keys
+    })
+
+    loader = registry.get_loader(test_csv, schema.data_type, './data')
+    with pytest.raises(ValueError, match="No keys defined"):
+        loader.load(test_csv, schema)
+
+def test_invalid_key_type(test_csv):
+    registry = LoaderRegistry()
+    schema = FoundrySchema({
+        'data_type': 'tabular',
+        'keys': [
+            {'key': ['col1'], 'type': 'invalid_type'}  # Invalid key type
+        ]
+    })
+
+    loader = registry.get_loader(test_csv, schema.data_type, './data')
+    with pytest.raises(ValueError, match="Invalid key type"):
+        loader.load(test_csv, schema)
+
+def test_unsupported_format():
+    class TestLoader(DataLoader):
+        def __init__(self, cache_dir):
+            super().__init__(cache_dir)
+
+        def supports_format(self, file_path):
+            return file_path.suffix == '.test'
+
+        def load(self, file_path, schema, split=None, as_hdf5=False):
+            if not self.supports_format(file_path):
+                raise ValueError(f"Unsupported format: {file_path.suffix}")
+            return None, None
+
+    loader = TestLoader('./data')
+    with pytest.raises(ValueError, match="Unsupported format"):
+        loader.load(Path('test.txt'), FoundrySchema({}))
\ No newline at end of file
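The `match=` patterns in these tests fix the error messages a loader must raise when the schema's keys are missing or mistyped. A sketch of validation logic that would satisfy them, operating on plain dicts for illustration (the hypothetical `split_keys` helper is not part of foundry):

```python
VALID_KEY_TYPES = {"input", "target"}

def split_keys(keys):
    """Partition schema key entries into input and target column names."""
    if not keys:
        raise ValueError("No keys defined in the dataset schema")
    inputs, targets = [], []
    for entry in keys:
        if entry["type"] not in VALID_KEY_TYPES:
            raise ValueError(f"Invalid key type: {entry['type']}")
        bucket = inputs if entry["type"] == "input" else targets
        bucket.extend(entry["key"])
    return inputs, targets

# The error strings contain the substrings the tests assert via pytest.raises(match=...).
print(split_keys([{"key": ["x1", "x2"], "type": "input"},
                  {"key": ["y"], "type": "target"}]))
# -> (['x1', 'x2'], ['y'])
```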
diff --git a/tests/test_loaders_specialized.py b/tests/test_loaders_specialized.py
new file mode 100644
index 00000000..7defb5ae
--- /dev/null
+++ b/tests/test_loaders_specialized.py
@@ -0,0 +1,79 @@
+import pytest
+import numpy as np
+from pathlib import Path
+
+from foundry.utils import optional_import
+from foundry.models import FoundrySchema
+
+# Import optional dependencies
+torch = optional_import('torch')
+tf = optional_import('tensorflow')
+rdkit = optional_import('rdkit')
+
+# Only import loaders if dependencies are available
+if torch:
+    from foundry.loaders.torch_wrapper import TorchDataset
+if tf:
+    from foundry.loaders.tf_wrapper import TensorflowSequence
+if rdkit:
+    from foundry.loaders.molecular import MolecularDataLoader
+    from rdkit import Chem
+
+@pytest.fixture
+def sample_data():
+    return {
+        'inputs': np.array([[1, 2], [3, 4]]),
+        'targets': np.array([5, 6])
+    }
+
+@pytest.mark.skipif(not torch, reason="PyTorch not installed")
+def test_torch_dataset(sample_data):
+    dataset = TorchDataset(sample_data['inputs'], sample_data['targets'])
+    assert len(dataset) == 2
+    inputs, targets = dataset[0]
+    assert np.array_equal(inputs, [1, 2])
+    assert targets == 5
+
+@pytest.mark.skipif(not tf, reason="TensorFlow not installed")
+def test_tensorflow_sequence(sample_data):
+    sequence = TensorflowSequence(sample_data['inputs'], sample_data['targets'])
+    assert len(sequence) == 2
+    inputs, targets = sequence[0]
+    assert np.array_equal(inputs, [1, 2])
+    assert targets == 5
+
+@pytest.mark.skipif(not rdkit, reason="RDKit not installed")
+class TestMolecularLoader:
+    @pytest.fixture
+    def loader(self):
+        return MolecularDataLoader('./data')
+
+    def test_supports_format(self, loader):
+        assert loader.supports_format(Path('test.sdf'))
+        assert loader.supports_format(Path('test.mol2'))
+        assert not loader.supports_format(Path('test.csv'))
+
+    # def test_load_sdf(self, loader, tmp_path):
+    #     sdf_path = tmp_path / 'test.sdf'
+    #     with open(sdf_path, 'w') as f:
+    #         f.write("""
+    # Test molecule
+    #   RDKit          3D
+
+    #   0  0  0  0  0  0  0  0  0  0999 V2000
+    #     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
+    # M  END
+    # $$$$
+    # """)
+
+    #     schema = FoundrySchema({
+    #         'data_type': 'molecular',
+    #         'keys': [
+    #             {'key': ['morgan_fp'], 'type': 'input'},
+    #             {'key': ['molecular_weight'], 'type': 'target'}
+    #         ]
+    #     })
+
+    #     inputs, targets = loader.load(sdf_path, schema)
+    #     assert 'morgan_fp' in inputs
+    #     assert 'molecular_weight' in targets
\ No newline at end of file
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 00000000..cbb610e1
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,49 @@
+import pytest
+import pandas as pd
+import numpy as np
+from pathlib import Path
+
+from foundry.utils.io import _read_csv, _read_json, _read_excel
+from foundry.utils.validation import is_doi, is_pandas_pytable
+from foundry.utils.imports import optional_import, require_package
+
+def test_read_csv(tmp_path):
+    # Create test CSV
+    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+    path = tmp_path / 'test.csv'
+    df.to_csv(path, index=False)
+
+    result = _read_csv(str(path))
+    pd.testing.assert_frame_equal(result, df)
+
+def test_read_json(tmp_path):
+    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+    path = tmp_path / 'test.json'
+    df.to_json(path)
+
+    result = _read_json(str(path))
+    pd.testing.assert_frame_equal(result, df)
+
+def test_read_excel(tmp_path):
+    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+    path = tmp_path / 'test.xlsx'
+    df.to_excel(path, index=False)
+
+    result = _read_excel(str(path))
+    pd.testing.assert_frame_equal(result, df)
+
+def test_is_doi():
+    assert is_doi('10.1234/abc123')
+    assert not is_doi('not-a-doi')
+
+def test_optional_import():
+    np = optional_import('numpy')
+    assert np is not None
+
+    nonexistent = optional_import('nonexistent_package')
+    assert nonexistent is None
+
+def test_require_package():
+    with pytest.raises(ImportError) as exc:
+        require_package('test_package', 'TestFeature')
+    assert 'TestFeature requires test_package' in str(exc.value)
\ No newline at end of file
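`test_utils.py` pins the behaviour of the two import helpers tightly: `optional_import` returns the module or `None`, and `require_package` raises an `ImportError` naming both the feature and the missing package. A minimal sketch consistent with those assertions — not necessarily `foundry.utils.imports` verbatim:

```python
import importlib

def optional_import(module_name):
    """Return the imported module, or None if it is not installed."""
    try:
        return importlib.import_module(module_name)
    except ImportError:
        return None

def require_package(package_name, feature_name):
    """Raise a helpful error when an optional feature's package is missing."""
    if optional_import(package_name) is None:
        # Message contains "<feature> requires <package>", as the test asserts.
        raise ImportError(
            f"{feature_name} requires {package_name}. "
            f"Install it with: pip install {package_name}"
        )
```

This pairing is what lets `test_loaders_specialized.py` skip cleanly on a core-only install (the `core` leg of the CI matrix) instead of failing at import time.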