Skip to content

Commit

Permalink
files for creating list of all possible short formulae
Browse files Browse the repository at this point in the history
  • Loading branch information
NSoiffer committed Dec 16, 2022
1 parent 72f766b commit ac77ab0
Show file tree
Hide file tree
Showing 3 changed files with 14,918 additions and 0 deletions.
86 changes: 86 additions & 0 deletions PythonScripts/chem_formula_from_wikipedia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from bs4 import BeautifulSoup
import re

# Both patterns run against text in which <sub>/<sup> markup has already been
# rewritten to "_" and "^" (see the cleanup in the add_*_to_set functions).
# Raw strings are used so that "\d" and "\^" reach the regex engine as regex
# escapes instead of being (invalid) Python string escapes.

# A whole formula consisting of one capital letter, optionally followed by a
# one-digit subscript ("_2") and then a charge ("^+", "^2-", "^−").
SingleLetterSingleElementFormulae = re.compile(r"^(?P<single>[A-Z](_\d)?(\^\d?[+\-−])?)$")

# A whole formula consisting of two such single-letter elements back to back;
# the two halves are captured separately as "first" and "second".
# NOTE(review): the optional charge on the first element is matched WITHOUT a
# leading "^", unlike the second element -- confirm whether that is intended.
SingleLetterDoubleElementFormulae = re.compile(r"^(?P<first>[A-Z](_\d)?(\d?[+\-−])?)(?P<second>[A-Z](_\d)?(\^\d?[+\-−])?)$")

def create_formulae_from_wikipedia_page(in_file: str, out_file):
    """Write the short chemical formulae found in a saved HTML page to a file.

    The page is parsed with BeautifulSoup.  For every table row that has
    exactly three ``<td>`` cells and whose first cell has non-empty text, the
    inner HTML of that first cell is handed to ``add_formula_to_set``, which
    decides whether it is a formula worth keeping.

    The collected formulae are written in sorted order as ``"formula", ``
    entries.  A line is ended as soon as appending the next entry would make
    it longer than 78 characters.

    Args:
        in_file: Path of the UTF-8 encoded HTML file to read.
        out_file: Path of the UTF-8 text file to create (or overwrite).
    """
    with open(in_file, encoding='utf8') as _in_stream, \
            open(out_file, 'w', encoding='utf8') as out_stream:
        file_contents = BeautifulSoup(_in_stream, features="html.parser")
        formulaeSet = set()
        for row in file_contents.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 3 and cols[0].get_text():
                add_formula_to_set(formulaeSet, cols[0].decode_contents())

        result = ''
        for formula in sorted(formulaeSet):
            entry = '"{}", '.format(formula)
            if result and len(result) + len(entry) > 78:
                out_stream.write(result + '\n')
                # Carry the entry that did not fit over to the next line;
                # resetting to '' here would silently drop it from the output.
                result = entry
            else:
                result += entry

        # Flush the final, partially filled line (no trailing newline).
        if result:
            out_stream.write(result)


def add_formula_to_set(formulaeSet, data):
    """Add ``data`` to ``formulaeSet`` if it is a one- or two-element formula.

    ``data`` is an HTML fragment.  Subscript/superscript tags are rewritten to
    ``_`` / ``^`` and surrounding whitespace is removed before matching.

    * If the cleaned text matches ``SingleLetterSingleElementFormulae`` it is
      added as is.
    * Otherwise, if it matches ``SingleLetterDoubleElementFormulae``, both
      orderings of the two elements are added.
    * Anything else is ignored.

    The set is modified in place; nothing is returned.
    """
    # The scraped markup isn't clean -- normalise it first.
    cleaned = data
    for markup, replacement in (("<sub>", "_"), ("</sub>", ""), ("<sup>", "^"), ("</sup>", "")):
        cleaned = cleaned.replace(markup, replacement)
    cleaned = cleaned.strip()

    single = SingleLetterSingleElementFormulae.match(cleaned)
    if single is not None:
        formulaeSet.add(single.group("single"))
        return

    pair = SingleLetterDoubleElementFormulae.match(cleaned)
    if pair is None:
        return

    # Record the two elements in both orders.
    first, second = pair.group("first"), pair.group("second")
    formulaeSet.add(first + second)
    formulaeSet.add(second + first)


def create_ions_from_wikipedia_page(in_file: str, out_file):
    """Write the short ion formulae found in a saved HTML page to a file.

    The page is parsed with BeautifulSoup.  The inner HTML of every element
    carrying the CSS class ``chemf`` is handed to ``add_ion_to_set``, which
    decides whether it is a formula worth keeping.

    The collected formulae are written in sorted order as ``"formula", ``
    entries.  A line is ended as soon as appending the next entry would make
    it longer than 79 characters.

    Args:
        in_file: Path of the UTF-8 encoded HTML file to read.
        out_file: Path of the UTF-8 text file to create (or overwrite).
    """
    with open(in_file, encoding='utf8') as _in_stream, \
            open(out_file, 'w', encoding='utf8') as out_stream:
        file_contents = BeautifulSoup(_in_stream, features="html.parser")
        formulaeSet = set()
        for ion in file_contents.find_all(class_='chemf'):
            add_ion_to_set(formulaeSet, ion.decode_contents())

        result = ''
        for formula in sorted(formulaeSet):
            entry = '"{}", '.format(formula)
            if result and len(result) + len(entry) > 79:
                out_stream.write(result + '\n')
                # Carry the entry that did not fit over to the next line;
                # resetting to '' here would silently drop it from the output.
                result = entry
            else:
                result += entry

        # Flush the final, partially filled line (no trailing newline).
        if result:
            out_stream.write(result)

# Matches leading text (anything up to the first "<") followed by a
# "template-chem2-su" span holding two inner spans: first an optional digit
# plus a sign ("+" or "−"), then a single digit.
# Groups: 1 = leading text, 2 = digit?/sign, 3 = digit.
# A raw string is used so "\d" is a regex escape, not an invalid string escape.
BothScripts = re.compile(r'([^<]+)<span class="template-chem2-su"><span>(\d?[+−])</span><span>(\d)</span></span>')
def add_ion_to_set(formulaeSet, data):
    """Add ``data`` to ``formulaeSet`` if it is a one- or two-element ion.

    ``data`` is an HTML fragment.  Plain and ``template-chem2`` flavoured
    ``<sub>`` / ``<sup>`` tags are rewritten to ``_`` / ``^``.  If the result
    matches ``BothScripts`` it is rewritten as ``text_digit^sign`` using that
    pattern's three groups.  Surrounding whitespace is then removed.

    * If the cleaned text matches ``SingleLetterSingleElementFormulae`` it is
      added as is.
    * Otherwise, if it matches ``SingleLetterDoubleElementFormulae``, both
      orderings of the two elements are added.
    * Anything else is ignored.

    The set is modified in place; nothing is returned.
    """
    # the data isn't clean -- do some cleanup
    data = data.replace('<sub>', "_").replace('<sub class="template-chem2-sub">', "_").replace('</sub>', "") \
        .replace('<sup>', "^").replace('<sup class="template-chem2-sup">', "^").replace('</sup>', "")

    bothScripts = BothScripts.match(data)
    if bothScripts is not None:
        # The markup lists the sign part (group 2) before the digit (group 3);
        # emit them in the "_digit^sign" order the formula patterns expect.
        data = "{}_{}^{}".format(bothScripts.group(1), bothScripts.group(3), bothScripts.group(2))

    data = data.strip()
    oneElement = SingleLetterSingleElementFormulae.match(data)
    if oneElement is not None:
        formulaeSet.add(oneElement.group("single"))
        return

    twoElements = SingleLetterDoubleElementFormulae.match(data)
    if twoElements is not None:
        # Record the two elements in both orders.
        formulaeSet.add(twoElements.group("first") + twoElements.group("second"))
        formulaeSet.add(twoElements.group("second") + twoElements.group("first"))

# Script entry: regenerate both listings from the locally saved HTML pages.
create_formulae_from_wikipedia_page(
    in_file="wikipedia-chemical_formulae.html",
    out_file="chem_formula.txt",
)
create_ions_from_wikipedia_page(
    in_file="wikipedia-ions.html",
    out_file="chem_ions.txt",
)
Loading

0 comments on commit ac77ab0

Please sign in to comment.