-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathjson2cif.py
executable file
·126 lines (101 loc) · 4.13 KB
/
json2cif.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import json
import numpy as np
from ase import Atoms
from ase.io import write
import os
import re
from collections import defaultdict
import argparse
def decode_ndarray(obj):
    """JSON ``object_hook`` that rebuilds numpy arrays from their encoded form.

    A dict carrying an ``'__ndarray__'`` key is expected to hold a
    ``(shape, dtype, data)`` triple under that key; ``data`` is converted to
    an array of ``dtype`` and reshaped to ``shape``. Any other object is
    returned unchanged.
    """
    is_encoded_array = isinstance(obj, dict) and '__ndarray__' in obj
    if not is_encoded_array:
        return obj

    shape, dtype, data = obj['__ndarray__']
    flat = np.array(data, dtype=dtype)
    return flat.reshape(shape)
def json_line_to_atoms(json_line):
    """Parse one JSON record and build an ASE ``Atoms`` object from it.

    Returns a ``(atoms, formula_pretty)`` pair. ``formula_pretty`` is the
    record's ``"formula_pretty"`` value, or ``"unknown"`` when that key is
    absent. ``atoms`` is ``None`` when the record has no ``"structure"`` key;
    otherwise it is a periodic ``Atoms`` built from the lattice matrix and
    each site's ``"xyz"`` position and ``"label"``.
    """
    record = json.loads(json_line, object_hook=decode_ndarray)
    formula_pretty = record.get("formula_pretty", "unknown")

    if "structure" not in record:
        return None, formula_pretty

    structure = record["structure"]
    cell = structure["lattice"]["matrix"]
    sites = structure["sites"]
    positions = [site["xyz"] for site in sites]
    # NOTE(review): site labels are passed straight through as element
    # symbols; confirm the input never carries decorated labels.
    symbols = [site["label"] for site in sites]
    return Atoms(symbols=symbols, positions=positions, cell=cell, pbc=True), formula_pretty
def normalize_formula(formula):
    """Flatten a chemical formula into a parenthesis-free element string.

    Parenthesised groups are expanded by their trailing multiplier, counts of
    repeated elements are summed, and elements are emitted in order of first
    appearance. A count is written only when it is greater than 1.
    Characters that are neither parentheses nor part of an element token are
    skipped.

    Unbalanced parentheses are tolerated instead of crashing or losing data:
    a ``)`` without a matching ``(`` is treated as closing a group that
    started at the beginning of the formula, and groups still open at the end
    of the string are closed implicitly with multiplier 1.

    Args:
        formula: Formula text such as ``"Ca(OH)2"``.

    Returns:
        The flattened formula, e.g. ``"CaO2H2"``; ``""`` for empty input.
    """
    token = re.compile(r'([A-Za-z][a-z]?)(\d*)')

    def parse_formula(text):
        # Return the fully expanded list of (element, count) pairs.
        open_groups = []  # enclosing groups still waiting for their ')'
        current = []
        i = 0
        end = len(text)
        while i < end:
            char = text[i]
            if char == '(':
                open_groups.append(current)
                current = []
                i += 1
            elif char == ')':
                i += 1
                start = i
                # isdecimal() (not isdigit()) so that characters such as
                # superscript digits, which int() rejects, are not consumed.
                while i < end and text[i].isdecimal():
                    i += 1
                factor = int(text[start:i] or 1)
                # A stray ')' has no enclosing group: merge into an empty one.
                outer = open_groups.pop() if open_groups else []
                outer.extend((elem, num * factor) for elem, num in current)
                current = outer
            else:
                # Match in place rather than slicing text[i:] on every token.
                found = token.match(text, i)
                if found:
                    current.append((found.group(1), int(found.group(2) or 1)))
                    i = found.end()
                else:
                    i += 1
        # Close any groups left open so elements before '(' are not dropped.
        while open_groups:
            outer = open_groups.pop()
            outer.extend(current)
            current = outer
        return current

    # A plain dict keeps insertion order, i.e. order of first appearance.
    totals = {}
    for elem, num in parse_formula(formula):
        totals[elem] = totals.get(elem, 0) + num
    return ''.join(f"{elem}{num if num > 1 else ''}" for elem, num in totals.items())
def get_unique_filename(output_dir, base_name, count):
    """Build the CIF path for the ``count``-th file sharing ``base_name``.

    ``count`` is the number of files already produced for this base name:
    0 yields ``<base_name>.cif``, and any other value yields
    ``<base_name>-<count + 1>.cif``, both joined onto ``output_dir``.
    The filesystem is not consulted.
    """
    suffix = "" if count == 0 else f"-{count + 1}"
    return os.path.join(output_dir, f"{base_name}{suffix}.cif")
def convert_json_lines_to_cifs(json_file, output_dir):
    """Convert every record of a JSON-lines file into its own CIF file.

    Each non-blank line is parsed with ``json_line_to_atoms``. The output
    file name is derived from the normalized formula; when several records
    share a formula, later ones get a ``-2``, ``-3``, ... suffix. Records
    that cannot be converted are reported on stdout and skipped so that one
    bad line does not abort the whole batch.

    Args:
        json_file: Path to the input file, one JSON object per line.
        output_dir: Directory for the CIF files; created if missing.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    formula_count = defaultdict(int)
    with open(json_file, 'r', encoding='utf-8') as file:
        # Stream the file instead of loading every line into memory at once.
        for line_no, line in enumerate(file, start=1):
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue

            try:
                atoms, formula_pretty = json_line_to_atoms(line)
            except json.JSONDecodeError as e:
                print(f"JSON decoding error in line {line_no}: {e}")
                continue
            except (AttributeError, KeyError, TypeError, ValueError) as e:
                # Valid JSON of the wrong shape: non-object record, missing
                # lattice/sites keys, or data Atoms cannot accept.
                print(f"Malformed structure record in line {line_no}: {e!r}")
                continue

            if atoms is None:
                print(f"No valid structure found in line {line_no}.")
                continue

            # Guard against a null/non-string or empty formula, which would
            # otherwise crash or produce a hidden ".cif" file.
            base_name = normalize_formula(formula_pretty) if isinstance(formula_pretty, str) else ""
            base_name = base_name or "unknown"

            cif_file = get_unique_filename(output_dir, base_name, formula_count[base_name])
            write(cif_file, atoms)
            formula_count[base_name] += 1
def main():
    """Command-line entry point.

    Reads the required ``--file`` option and the optional ``--outdir`` option
    (default ``CIF``) and runs the JSON-lines to CIF conversion with them.
    """
    cli = argparse.ArgumentParser(description='Convert JSON lines to CIF files.')
    cli.add_argument(
        '--file',
        type=str,
        required=True,
        help='Path to the JSON file.',
    )
    cli.add_argument(
        '--outdir',
        type=str,
        default='CIF',
        help='Output directory for CIF files.',
    )
    options = cli.parse_args()
    convert_json_lines_to_cifs(options.file, options.outdir)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()