Skip to content

Commit

Permalink
Improve ID generation scheme (#55)
Browse files Browse the repository at this point in the history
* Define new scheme for ID generation and set `immutable_id` and `last_modified`

* Finalize new ID generation scheme and update tests

* Linting fixes

* adapt unique id generation; add extra test

---------

Co-authored-by: Kristjan Eimre <[email protected]>
  • Loading branch information
ml-evs and eimrek authored May 30, 2024
1 parent e7043fd commit ec20a8d
Show file tree
Hide file tree
Showing 27 changed files with 693 additions and 12 deletions.
2 changes: 1 addition & 1 deletion examples/bzipped_pymatgen/.testing/first_entry.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"id": "agm003188153", "type": "structures", "links": null, "meta": null, "attributes": {"immutable_id": null, "last_modified": null, "elements": ["Ac"], "nelements": 1, "elements_ratios": [1.0], "chemical_formula_descriptive": "Ac4", "chemical_formula_reduced": "Ac", "chemical_formula_hill": null, "chemical_formula_anonymous": "A", "dimension_types": [1, 1, 1], "nperiodic_dimensions": 3, "lattice_vectors": [[3.92252914, 0.0, 0.0], [-1.96126457, 3.39700988, 0.0], [0.0, 0.0, 12.69499244]], "cartesian_site_positions": [[0.0, 0.0, 0.0], [-1.961264582666056e-08, 2.2646732646567, 3.17374811], [0.0, 0.0, 6.34749622], [1.9612645896126455, 1.1323366153433003, 9.52124433]], "nsites": 4, "species": [{"name": "Ac", "chemical_symbols": ["Ac"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}], "species_at_sites": ["Ac", "Ac", "Ac", "Ac"], "assemblies": null, "structure_features": [], "mat_id": "agm003188153", "prototype_id": "A_22_spg194", "location": "database/batch-000/Ac/Ac/xxx_02s-00_agm003188153_spg194", "formula": "Ac", "spg": 194, "stress": [[0.8811667, 0.0, 0.0], [0.0, 0.8811667, 0.0], [0.0, 0.0, -0.06628691]], "energy_total": -18.01479609, "total_mag": 0.0001825, "band_gap_ind": 0.0, "band_gap_dir": 0.0056, "dos_ef": 6.1911793, "energy_corrected": -18.014795, "e_above_hull": 0.0, "e_form": 0.0, "e_phase_separation": 0.0, "decomposition": " Ac ", "energy": null, "hull_distance": true, "formation_energy": true, "space_group_number": true}, "relationships": null}
{"id": "agm003188153", "type": "structures", "links": null, "meta": null, "attributes": {"immutable_id": "agm003188153", "last_modified": "1970", "elements": ["Ac"], "nelements": 1, "elements_ratios": [1.0], "chemical_formula_descriptive": "Ac4", "chemical_formula_reduced": "Ac", "chemical_formula_hill": null, "chemical_formula_anonymous": "A", "dimension_types": [1, 1, 1], "nperiodic_dimensions": 3, "lattice_vectors": [[3.92252914, 0.0, 0.0], [-1.96126457, 3.39700988, 0.0], [0.0, 0.0, 12.69499244]], "cartesian_site_positions": [[0.0, 0.0, 0.0], [-1.961264582666056e-08, 2.2646732646567, 3.17374811], [0.0, 0.0, 6.34749622], [1.9612645896126455, 1.1323366153433003, 9.52124433]], "nsites": 4, "species": [{"name": "Ac", "chemical_symbols": ["Ac"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}], "species_at_sites": ["Ac", "Ac", "Ac", "Ac"], "assemblies": null, "structure_features": [], "mat_id": "agm003188153", "prototype_id": "A_22_spg194", "location": "database/batch-000/Ac/Ac/xxx_02s-00_agm003188153_spg194", "formula": "Ac", "spg": 194, "stress": [[0.8811667, 0.0, 0.0], [0.0, 0.8811667, 0.0], [0.0, 0.0, -0.06628691]], "energy_total": -18.01479609, "total_mag": 0.0001825, "band_gap_ind": 0.0, "band_gap_dir": 0.0056, "dos_ef": 6.1911793, "energy_corrected": -18.014795, "e_above_hull": 0.0, "e_form": 0.0, "e_phase_separation": 0.0, "decomposition": " Ac ", "energy": null, "hull_distance": true, "formation_energy": true, "space_group_number": true}, "relationships": null}
1 change: 0 additions & 1 deletion examples/folder_of_cifs/.testing/first_entry.json

This file was deleted.

6 changes: 3 additions & 3 deletions examples/xyz_files_no_compression/.testing/first_entry.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
{
"id": "H_1.xyz",
"id": "H_1",
"type": "structures",
"links": null,
"meta": null,
"attributes": {
"immutable_id": null,
"last_modified": null,
"immutable_id": "H_1.xyz",
"last_modified": "1970",
"elements": [
"H"
],
Expand Down
1 change: 1 addition & 0 deletions examples/zip_of_cif/.testing/first_entry.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id": "55c564f6-ac6a-4122-b8d9-0ad9fe61e961", "type": "structures", "links": null, "meta": null, "attributes": {"immutable_id": "structures.zip/structures/cifs/55c564f6-ac6a-4122-b8d9-0ad9fe61e961.cif", "last_modified": "1970", "elements": ["Ba", "C", "N", "S"], "nelements": 4, "elements_ratios": [0.14285714285714285, 0.2857142857142857, 0.2857142857142857, 0.2857142857142857], "chemical_formula_descriptive": "C4Ba2N4S4", "chemical_formula_reduced": "BaC2N2S2", "chemical_formula_hill": null, "chemical_formula_anonymous": "A2B2C2D", "dimension_types": [1, 1, 1], "nperiodic_dimensions": 3, "lattice_vectors": [[6.3587627540404945, 0.0, 0.0], [-2.672647488887009, 5.769819681958754, 0.0], [0.25844951934994664, -0.16511343006546234, 8.71190314896161]], "cartesian_site_positions": [[3.4987802863851005, 5.049341739457014, 6.533927361693402], [0.4457844982025419, 0.5553645123824795, 2.177975787264444], [0.37416734784252487, 2.642448780492868, 5.291889331690525], [2.6281157156591126, 1.202488098280357, 7.775965391700042], [3.5703974366609055, 2.962257471400425, 3.420013817271085], [1.316449068841148, 4.402218153614962, 0.9359377571616379], [0.3512600296798777, 4.156339511491648, 5.900755701251229], [4.011419364830723, 1.818004876809729, 7.167099022143105], [3.5933047547393455, 1.4483667404554434, 2.8111474477141454], [-0.06685458032729186, 3.786701375083563, 1.5448041268185058], [0.3773832460379156, 1.5350786992068879, 4.8597303327393915], [1.621957615426957, 0.7399681351855376, 8.208124390751108], [3.567181538468686, 4.069627552684378, 3.8521728163221476], [2.3226071690796486, 4.864738116705728, 0.5037787583104332]], "nsites": 14, "species": [{"name": "Ba", "chemical_symbols": ["Ba"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "C", "chemical_symbols": ["C"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "N", "chemical_symbols": ["N"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "S", "chemical_symbols": ["S"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}], "species_at_sites": ["Ba", "Ba", "C", "C", "C", "C", "S", "S", "S", "S", "N", "N", "N", "N"], "assemblies": null, "structure_features": [], "_mcloudarchive_property_b": 0.99, "_mcloudarchive_energy": -0.54, "_mcloudarchive_structure_description": "some description"}, "relationships": null}
File renamed without changes.
File renamed without changes.
4 changes: 4 additions & 0 deletions examples/zip_of_cif/data/data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
id,energy,Property B
55c564f6-ac6a-4122-b8d9-0ad9fe61e961,-0.54,0.99
cc1a41b1-a841-4818-baf1-a6c1441dc52a,-0.45,0.86
991bec7a-b3a8-49af-ba6d-be5afd685cd4,-0.55,1.01
4 changes: 4 additions & 0 deletions examples/zip_of_cif/data/data2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
id,structure_description
structures.zip/structures/cifs/55c564f6-ac6a-4122-b8d9-0ad9fe61e961.cif,"some description"
structures.zip/structures/cifs/cc1a41b1-a841-4818-baf1-a6c1441dc52a.cif,"describing something else"
structures.zip/structures/cifs/991bec7a-b3a8-49af-ba6d-be5afd685cd4.cif,null
27 changes: 27 additions & 0 deletions examples/zip_of_cif/data/refs.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
@article{Andersen2020,
title = {The {{OPTIMADE Specification}}},
author = {Andersen, Casper Welzel and Armiento, Rickard and Blokhin, Evgeny and Conduit, Gareth and Dwaraknath, Shyam and Evans, Matthew L and Fekete, {\'A}d{\'a}m and Gopakumar, Abhijith and Gra{\v z}ulis, Saulius and Merkys, Andrius and Mohamed, Fawzi and Oses, Corey and Pizzi, Giovanni and Rignanese, Gian-Marco and Scheidgen, Markus and Talirz, Leopold and Toher, Cormac and Winston, Donald},
year = {2020},
month = jul,
journal = {Zenodo},
doi = {10.5281/zenodo.4195050},
urldate = {2022-06-24},
langid = {english}
}

@article{Andersen2021,
title = {{{OPTIMADE}}, an {{API}} for exchanging materials data},
author = {Andersen, Casper W. and Armiento, Rickard and Blokhin, Evgeny and Conduit, Gareth J. and Dwaraknath, Shyam and Evans, Matthew L. and Fekete, {\'A}d{\'a}m and Gopakumar, Abhijith and Gra{\v z}ulis, Saulius and Merkys, Andrius and Mohamed, Fawzi and Oses, Corey and Pizzi, Giovanni and Rignanese, Gian-Marco and Scheidgen, Markus and Talirz, Leopold and Toher, Cormac and Winston, Donald and Aversa, Rossella and Choudhary, Kamal and Colinet, Pauline and Curtarolo, Stefano and Di Stefano, Davide and Draxl, Claudia and Er, Suleyman and Esters, Marco and Fornari, Marco and Giantomassi, Matteo and Govoni, Marco and Hautier, Geoffroy and Hegde, Vinay and Horton, Matthew K. and Huck, Patrick and Huhs, Georg and Hummelsh{\o}j, Jens and Kariryaa, Ankit and Kozinsky, Boris and Kumbhar, Snehal and Liu, Mohan and Marzari, Nicola and Morris, Andrew J. and Mostofi, Arash A. and Persson, Kristin A. and Petretto, Guido and Purcell, Thomas and Ricci, Francesco and Rose, Frisco and Scheffler, Matthias and Speckhard, Daniel and Uhrin, Martin and Vaitkus, Antanas and Villars, Pierre and Waroquiers, David and Wolverton, Chris and Wu, Michael and Yang, Xiaoyu},
year = {2021},
month = aug,
journal = {Scientific Data},
volume = {8},
number = {1},
pages = {217},
publisher = {{Nature Publishing Group}},
doi = {10.1038/s41597-021-00974-z},
urldate = {2021-08-12},
copyright = {2021 The Author(s)},
langid = {english},
annotation = {Bandiera\_abtest: a Cc\_license\_type: cc\_by Cg\_type: Nature Research Journals Primary\_atype: Research Subject\_term: Computational science;Condensed-matter physics;Theory and computation Subject\_term\_id: computational-science;condensed-matter-physics;theory-and-computation}
}
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@

##########################################################################
# Crystallographic Information Format file
# Produced by PyCifRW module
#
# This is a CIF file. CIF has been adopted by the International
# Union of Crystallography as the standard for data archiving and
# transmission.
#
# For information on this file format, follow the CIF links at
# http://www.iucr.org
##########################################################################

data_0

loop_
_atom_site_label
_atom_site_fract_x
_atom_site_fract_y
_atom_site_fract_z
_atom_site_type_symbol
Ba1 0.8965924582118168 0.8965924581978235 0.7499999999968082 Ba
Ba2 0.10340754179751976 0.10340754179284 0.25000000000275957 Ba
C1 0.23395238080264336 0.4753604312934071 0.6074320663586895 C
C2 0.47536043129806416 0.2339523807979867 0.892567933635359 C
C3 0.7660476191973566 0.524639568706593 0.39256793364131043 C
C4 0.5246395687019131 0.7660476192020365 0.10743206635317039 C
S1 0.3386310702634352 0.7397413705299642 0.6773210859161757 S
S2 0.7397413705252843 0.3386310702681149 0.8226789140783051 S
S3 0.6613689297272285 0.2602586294793724 0.32267891408425636 S
S4 0.26025862947471556 0.6613689297318851 0.17732108592169485 S
N1 0.15520998661097707 0.2820163252279592 0.5578264874671652 N
N2 0.2820163252326391 0.15520998660629753 0.9421735125383539 N
N3 0.8447900133890457 0.7179836747720179 0.44217351254430515 N
N4 0.717983674767384 0.8447900133936794 0.05782648747311654 N
_cell_angle_alpha 91.69894776135195
_cell_angle_beta 88.30105223864805
_cell_angle_gamma 114.8541200717058
_cell_length_a 6.3587627540404945
_cell_length_b 6.3587627540404945
_cell_length_c 8.717299758281083
loop_
_symmetry_equiv_pos_as_xyz
'x, y, z'
_symmetry_int_tables_number 1
_symmetry_space_group_name_H-M 'P 1'
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@

##########################################################################
# Crystallographic Information Format file
# Produced by PyCifRW module
#
# This is a CIF file. CIF has been adopted by the International
# Union of Crystallography as the standard for data archiving and
# transmission.
#
# For information on this file format, follow the CIF links at
# http://www.iucr.org
##########################################################################

data_0

loop_
_atom_site_label
_atom_site_fract_x
_atom_site_fract_y
_atom_site_fract_z
_atom_site_type_symbol
Sr1 0.0 0.0 0.0 Sr
C1 0.5 0.5 0.5 C
_cell_angle_alpha 60.00000000000001
_cell_angle_beta 60.00000000000001
_cell_angle_gamma 60.00000000000001
_cell_length_a 4.006498849786306
_cell_length_b 4.006498849786306
_cell_length_c 4.006498849786306
loop_
_symmetry_equiv_pos_as_xyz
'x, y, z'
_symmetry_int_tables_number 1
_symmetry_space_group_name_H-M 'P 1'
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@

##########################################################################
# Crystallographic Information Format file
# Produced by PyCifRW module
#
# This is a CIF file. CIF has been adopted by the International
# Union of Crystallography as the standard for data archiving and
# transmission.
#
# For information on this file format, follow the CIF links at
# http://www.iucr.org
##########################################################################

data_0

loop_
_atom_site_label
_atom_site_fract_x
_atom_site_fract_y
_atom_site_fract_z
_atom_site_type_symbol
Mg1 0.3230150293403171 0.17698497065439878 0.0 Mg
Mg2 0.8230150293456011 0.3230150293403171 0.0 Mg
Mg3 0.6769849706491146 0.8230150293456011 0.0 Mg
Mg4 0.17698497065439878 0.6769849706491146 0.0 Mg
Zn1 0.4999999999947159 0.4999999999947159 0.0 Zn
Zn2 0.0 0.0 0.0 Zn
B1 0.8727086545805982 0.6272913454141176 0.0 B
B2 0.3727086545858823 0.8727086545805982 0.0 B
B3 0.12729134541940176 0.3727086545858823 0.0 B
B4 0.6272913454141176 0.12729134541940176 0.0 B
Ir1 0.4299565041525537 0.7160135910847766 0.4999999999999829 Ir
Ir2 0.9299565041472696 0.7839864089099285 0.4999999999999829 Ir
Ir3 0.28398640891521276 0.4299565041525537 0.4999999999999829 Ir
Ir4 0.7839864089099285 0.07004349584215788 0.4999999999999829 Ir
Ir5 0.5700434958474356 0.28398640891521276 0.4999999999999829 Ir
Ir6 0.07004349584215788 0.21601359107949256 0.4999999999999829 Ir
Ir7 0.7160135910847766 0.5700434958474356 0.4999999999999829 Ir
Ir8 0.21601359107949256 0.9299565041472696 0.4999999999999829 Ir
Ir9 0.4999999999947159 0.0 0.4999999999999829 Ir
Ir10 0.0 0.4999999999947159 0.4999999999999829 Ir
_cell_angle_alpha 90.0
_cell_angle_beta 90.0
_cell_angle_gamma 90.0
_cell_length_a 9.4623270052342
_cell_length_b 9.4623270052342
_cell_length_c 2.9327245575729
loop_
_symmetry_equiv_pos_as_xyz
'x, y, z'
_symmetry_int_tables_number 1
_symmetry_space_group_name_H-M 'P 1'
1 change: 1 addition & 0 deletions examples/zip_of_cif_and_xyz/.testing/first_entry.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id": "set1/55c564f6-ac6a-4122-b8d9-0ad9fe61e961.cif", "type": "structures", "links": null, "meta": null, "attributes": {"immutable_id": "structures.zip/structures/set1/55c564f6-ac6a-4122-b8d9-0ad9fe61e961.cif", "last_modified": "2024-05-30T18:52:22.987047", "elements": ["Ba", "C", "N", "S"], "nelements": 4, "elements_ratios": [0.14285714285714285, 0.2857142857142857, 0.2857142857142857, 0.2857142857142857], "chemical_formula_descriptive": "C4Ba2N4S4", "chemical_formula_reduced": "BaC2N2S2", "chemical_formula_hill": null, "chemical_formula_anonymous": "A2B2C2D", "dimension_types": [1, 1, 1], "nperiodic_dimensions": 3, "lattice_vectors": [[6.3587627540404945, 0.0, 0.0], [-2.672647488887009, 5.769819681958754, 0.0], [0.25844951934994664, -0.16511343006546234, 8.71190314896161]], "cartesian_site_positions": [[3.4987802863851005, 5.049341739457014, 6.533927361693402], [0.4457844982025419, 0.5553645123824795, 2.177975787264444], [0.37416734784252487, 2.642448780492868, 5.291889331690525], [2.6281157156591126, 1.202488098280357, 7.775965391700042], [3.5703974366609055, 2.962257471400425, 3.420013817271085], [1.316449068841148, 4.402218153614962, 0.9359377571616379], [0.3512600296798777, 4.156339511491648, 5.900755701251229], [4.011419364830723, 1.818004876809729, 7.167099022143105], [3.5933047547393455, 1.4483667404554434, 2.8111474477141454], [-0.06685458032729186, 3.786701375083563, 1.5448041268185058], [0.3773832460379156, 1.5350786992068879, 4.8597303327393915], [1.621957615426957, 0.7399681351855376, 8.208124390751108], [3.567181538468686, 4.069627552684378, 3.8521728163221476], [2.3226071690796486, 4.864738116705728, 0.5037787583104332]], "nsites": 14, "species": [{"name": "Ba", "chemical_symbols": ["Ba"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "N", "chemical_symbols": ["N"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "S", "chemical_symbols": ["S"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "C", "chemical_symbols": ["C"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}], "species_at_sites": ["Ba", "Ba", "C", "C", "C", "C", "S", "S", "S", "S", "N", "N", "N", "N"], "assemblies": null, "structure_features": []}, "relationships": null}
Loading

0 comments on commit ec20a8d

Please sign in to comment.