files for creating list of all possible short formulae

NSoiffer · Dec 16, 2022 · ac77ab0 · ac77ab0
1 parent 72f766b
commit ac77ab0
Show file tree

Hide file tree

Showing 3 changed files with 14,918 additions and 0 deletions.
diff --git a/PythonScripts/chem_formula_from_wikipedia.py b/PythonScripts/chem_formula_from_wikipedia.py
@@ -0,0 +1,86 @@
+from bs4 import BeautifulSoup
+import re
+
+SingleLetterSingleElementFormulae = re.compile("^(?P<single>[A-Z](_\d)?(\^\d?[+\-−])?)$")
+SingleLetterDoubleElementFormulae = re.compile("^(?P<first>[A-Z](_\d)?(\d?[+\-−])?)(?P<second>[A-Z](_\d)?(\^\d?[+\-−])?)$")
+
+def create_formulae_from_wikipedia_page(in_file: str, out_file):
+    with open(in_file, encoding='utf8') as _in_stream:
+        with open(out_file, 'w', encoding='utf8') as out_stream:
+            file_contents = BeautifulSoup(_in_stream, features="html.parser")
+            formulaeSet = set()
+            for row in file_contents.find_all('tr'):
+                cols = row.find_all('td')
+                if len(cols) == 3 and cols[0].get_text():
+                    result = add_formula_to_set(formulaeSet, cols[0].decode_contents())
+
+            result = ''
+            for formula in sorted(formulaeSet):
+                entry = '"{}", '.format(formula)
+                if len(result) + len(entry) > 78:
+                    out_stream.write(result + '\n')
+                    result = ''
+                else:
+                    result += entry
+
+            if len(result) > 0:
+                out_stream.write(result)
+
+
+def add_formula_to_set(formulaeSet, data):
+    # the data isn't clean -- do some cleanup
+    data = data.replace("<sub>", "_").replace("</sub>", "").replace("<sup>", "^").replace("</sup>", "")
+    data = data.strip()
+    oneElement = SingleLetterSingleElementFormulae.match(data)
+    if not(oneElement is None):
+        formulaeSet.add(oneElement.group("single"))
+    else:
+        twoElements = SingleLetterDoubleElementFormulae.match(data)
+        if not(twoElements is None):
+            formulaeSet.add(twoElements.group("first") + twoElements.group("second"))
+            formulaeSet.add(twoElements.group("second") + twoElements.group("first"))
+
+
+def create_ions_from_wikipedia_page(in_file: str, out_file):
+    with open(in_file, encoding='utf8') as _in_stream:
+        with open(out_file, 'w', encoding='utf8') as out_stream:
+            file_contents = BeautifulSoup(_in_stream, features="html.parser")
+            formulaeSet = set()
+            for ion in file_contents.find_all(class_= 'chemf'):
+                result = add_ion_to_set(formulaeSet, ion.decode_contents())
+
+            result = ''
+            for formula in sorted(formulaeSet):
+                entry = '"{}", '.format(formula)
+                if len(result) + len(entry) > 79:
+                    out_stream.write(result + '\n')
+                    result = ''
+                else:
+                    result += entry
+
+            if len(result) > 0:
+                out_stream.write(result)
+
+BothScripts = re.compile('([^<]+)<span class="template-chem2-su"><span>(\d?[+−])</span><span>(\d)</span></span>')
+def add_ion_to_set(formulaeSet, data):
+    # the data isn't clean -- do some cleanup
+    data = data.replace('<sub>', "_").replace('<sub class="template-chem2-sub">', "_").replace('</sub>', "") \
+                .replace('<sup>', "^").replace('<sup class="template-chem2-sup">', "^").replace('</sup>', "")
+    bothScripts = BothScripts.match(data)
+    if not(bothScripts is None):
+        data = "{}_{}^{}".format(bothScripts.group(1), bothScripts.group(3), bothScripts.group(2))
+    else:
+        bothScripts = BothScripts.match(data)
+
+    data = data.strip()
+    oneElement = SingleLetterSingleElementFormulae.match(data)
+    if not(oneElement is None):
+        formulaeSet.add(oneElement.group("single"))
+    else:
+        twoElements = SingleLetterDoubleElementFormulae.match(data)
+        if not(twoElements is None):
+            formulaeSet.add(twoElements.group("first") + twoElements.group("second"))
+            formulaeSet.add(twoElements.group("second") + twoElements.group("first"))
+
+create_formulae_from_wikipedia_page("wikipedia-chemical_formulae.html", "chem_formula.txt")
+create_ions_from_wikipedia_page("wikipedia-ions.html", "chem_ions.txt")