Add FEFF schema #41

Open
wants to merge 7 commits into main
Changes from 1 commit
38 changes: 38 additions & 0 deletions aimmdb/_tests/ingest/test_feff.py
@@ -0,0 +1,38 @@
import copy
import pandas as pd
from pathlib import Path

from aimmdb.ingest import load_feff_data


DATA_PATH = Path("aimmdb/_tests/data/feff/65272_C_007")


def test_load_feff_data():

    data, metadata = load_feff_data(DATA_PATH)

    assert isinstance(data, pd.DataFrame)
    assert isinstance(metadata, dict)
    assert isinstance(metadata["feff.inp"], str)
    assert isinstance(metadata["feff.out"], str)
    assert isinstance(metadata["xmu.dat-comments"], str)


def test_copy_feff_data():

    data, metadata = load_feff_data(DATA_PATH)

    data_copy = copy.deepcopy(data)

    assert data.equals(data_copy)

    metadata_copy = copy.deepcopy(metadata)

    # metadata is a plain dict, so compare with == rather than DataFrame.equals
    assert metadata == metadata_copy

    assert isinstance(data_copy, pd.DataFrame)
    assert isinstance(metadata_copy, dict)
    assert isinstance(metadata_copy["feff.inp"], str)
    assert isinstance(metadata_copy["feff.out"], str)
    assert isinstance(metadata_copy["xmu.dat-comments"], str)
Empty file added aimmdb/ingest/__init__.py
Empty file.
File renamed without changes.
8 changes: 0 additions & 8 deletions aimmdb/schemas.py
@@ -146,14 +146,6 @@ class BatteryChargeMetadataInternal(pydantic.BaseModel):
class BatteryChargeMetadata(pydantic.BaseModel, extra=pydantic.Extra.allow):
    charge: BatteryChargeMetadataInternal

class FEFFpotentials(pydantic.BaseModel, extra=pydantic.Extra.allow):
    x: Optional[str]
    ipot: int
    Z: str
    element: int
    l_scmt: int
    l_fms: int


class FEFFcards(pydantic.BaseModel, extra=pydantic.Extra.allow):
Contributor

Same as my previous comment:

This is too fine-grained. We can simply remove it for now. All of this is going into input_script.

    atoms: float
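
A minimal, hypothetical sketch of the coarser-grained shape the comment above suggests, following the pydantic pattern already used in schemas.py; the class name and field set here are illustrative only, not part of this PR:

import pydantic


class FEFFInputMetadataSketch(pydantic.BaseModel, extra=pydantic.Extra.allow):
    # keep the raw feff.inp contents verbatim instead of modeling each
    # potentials entry field-by-field as the removed FEFFpotentials model did
    input_script: str
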
407 changes: 0 additions & 407 deletions ingest/ingest_FEFF.ipynb

This file was deleted.

File renamed without changes.
170 changes: 170 additions & 0 deletions notebooks/ingest_FEFF.ipynb
@@ -0,0 +1,170 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tiled.client import from_uri\n",
"client = from_uri(\"http://localhost:8000/api\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"\n",
"DATA_PATH = Path(\"aimmdb/data/feff/65272_C_007\")\n",
"print(\"Data Path:\", DATA_PATH)\n",
"\n",
"contents = os.listdir(DATA_PATH)\n",
"print(\"Contents:\", contents)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from load_FEFF_Data import load_feff_data\n",
"\n",
"data, metadata = load_feff_data(DATA_PATH)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def ingest_feff(client, df, verbose=False):\n",
" \"\"\"\n",
" Upload the FEFF dataset to database\n",
" \"\"\"\n",
"\n",
" for (name, prep), g in df.groupby([\"sample.name\", \"sample.prep\"]):\n",
" if verbose:\n",
" print(f\"{name}: {prep}, {len(g)}\")\n",
"\n",
" sample_id = client.write_sample({\"name\" : name, \"prep\" : prep})\n",
"\n",
" for i, row in g.iterrows():\n",
" feff_df, _ = read_dat(row.file)\n",
" metadata = row.metadata\n",
" metadata[\"dataset\"] = \"feff\"\n",
" metadata[\"sample_id\"] = sample_id\n",
" client[\"uid\"].write_dataframe(feff_df, metadata=metadata, specs=[\"FEFF\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"starting ingestion...\")\n",
"ingest_feff(client, feff, verbose=True)\n",
"print(\"finished.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"FEFFInputMetadata.schema()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"FEFFOutputMetadata.schema()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# we will enforce that XAS metadata satisfies the following schema\n",
"ExperimentalXASMetadata.schema()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client[\"uid\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# with the correct metadata we can write to the server\n",
"# NOTE this doesn't prevent you from writing garbage but does help\n",
"df = pd.DataFrame({\"a\" : np.random.rand(100), \"b\" : np.random.rand(100)})\n",
"metadata = {\"dataset\" : \"feff\", \"foo\" : \"bar\", \"element\" : {\"symbol\" : \"Au\", \"edge\" : \"K\"}, \"facility\" : {\"name\" : \"ALS\"}, \"beamline\" : {\"name\" : \"8.0.1\"}}\n",
"node = client[\"uid\"].write_dataframe(df, metadata=metadata, specs=[\"FEFF\"])\n",
"node"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.4 ('my_pymatgen')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "8cf392b7cd98023928c855fd79964086ca343b5f82a42ebb28f5e83ba8cfe45c"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "3d0110f7-ba1d-4817-95e0-1a9ec70770a0",
"metadata": {},
"outputs": [],
@@ -34,10 +34,47 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "1ff6333b-690f-4119-a2b2-bbe5c94c3112",
"metadata": {},
"outputs": [],
"outputs": [
Contributor

All of the notebooks we store in GitHub should have their outputs stripped. This is because often outputs can inadvertently contain images or just lots of text, and this can be very space-intensive. There are a variety of ways to do this, including by using the command palette in Jupyter or by doing it from the command line, e.g. this.
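
One common command-line option (assuming nbconvert is available) is:

jupyter nbconvert --clear-output --inplace notebooks/ingest_FEFF.ipynb

nbstripout is another widely used tool; running nbstripout --install inside the repository registers a git filter that strips outputs automatically on commit.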

{
"data": {
"text/plain": [
"{'title': 'ExperimentalXASMetadata',\n",
" 'type': 'object',\n",
" 'properties': {'element': {'$ref': '#/definitions/XDIElement'},\n",
" 'measurement_type': {'default': 'xas',\n",
" 'allOf': [{'$ref': '#/definitions/MeasurementEnum'}]},\n",
" 'dataset': {'title': 'Dataset', 'type': 'string'},\n",
" 'sample_id': {'title': 'Sample Id', 'type': 'string'},\n",
" 'facility': {'$ref': '#/definitions/FacilityMetadata'},\n",
" 'beamline': {'$ref': '#/definitions/BeamlineMetadata'}},\n",
" 'required': ['element', 'dataset', 'facility', 'beamline'],\n",
" 'definitions': {'XDIElement': {'title': 'XDIElement',\n",
" 'type': 'object',\n",
" 'properties': {'symbol': {'title': 'Symbol', 'type': 'string'},\n",
" 'edge': {'title': 'Edge', 'type': 'string'}},\n",
" 'required': ['symbol', 'edge']},\n",
" 'MeasurementEnum': {'title': 'MeasurementEnum',\n",
" 'description': 'An enumeration.',\n",
" 'enum': ['xas', 'rixs'],\n",
" 'type': 'string'},\n",
" 'FacilityMetadata': {'title': 'FacilityMetadata',\n",
" 'type': 'object',\n",
" 'properties': {'name': {'title': 'Name', 'type': 'string'}},\n",
" 'required': ['name']},\n",
" 'BeamlineMetadata': {'title': 'BeamlineMetadata',\n",
" 'type': 'object',\n",
" 'properties': {'name': {'title': 'Name', 'type': 'string'}},\n",
" 'required': ['name']}}}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# we will enforce that XAS metadata satisfies the following schema\n",
"ExperimentalXASMetadata.schema()"
@@ -56,10 +56,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "cbf5e6a8-77b8-4c1d-80f5-f773cbcc5681",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"ExperimentalXASMetadata(element=XDIElement(symbol='Fe', edge='K'), measurement_type='xas', dataset='example', sample_id=None, facility=FacilityMetadata(name=None), beamline=BeamlineMetadata(name='8.0.1'))"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# example of valid metadata\n",
"metadata = {\n",
@@ -1056,9 +1104,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:aimm]",
"display_name": "Python 3.9.13 ('aimm')",
"language": "python",
"name": "conda-env-aimm-py"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -1070,7 +1118,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.9.13"
},
"vscode": {
"interpreter": {
"hash": "189d756eac8438d33e11f8e23aa09bdc4c99760ed11a3bfefa464e31dcca4c4a"
}
}
},
"nbformat": 4,