Skip to content

Commit

Permalink
fix linting problems
Browse files (browse the repository at this point in the history)
  • Loading branch information
NSoiffer committed May 1, 2024
1 parent 0b56007 commit 621fbc4
Showing 1 changed file with 20 additions and 21 deletions.
41 changes: 20 additions & 21 deletions PythonScripts/chem_formula_from_wikipedia.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from bs4 import BeautifulSoup
import re

# Matches a whole string that is ONE single-letter (upper-case) element
# symbol, optionally followed by a one-digit subscript written "_d" and then
# an optional charge written "^" + optional digit + sign ("+", "-" or the
# Unicode minus "−").  The entire formula is exposed as group "single".
# Raw strings are used so the regex escapes (\d, \^, \-) reach `re` untouched.
SingleLetterSingleElementFormulae = re.compile(r"^(?P<single>[A-Z](_\d)?(\^\d?[+\-−])?)$")

# Matches a whole string made of TWO such single-letter element terms back to
# back, exposed as groups "first" and "second".
# NOTE(review): the charge part of "first" is `(\d?[+\-−])?` with no leading
# "\^", unlike every other charge group here -- possibly an oversight; confirm
# before changing, since it alters which formulae match.
SingleLetterDoubleElementFormulae = re.compile(r"^(?P<first>[A-Z](_\d)?(\d?[+\-−])?)(?P<second>[A-Z](_\d)?(\^\d?[+\-−])?)$")


def create_formulae_from_wikipedia_page(in_file: str, out_file):
with open(in_file, encoding='utf8') as _in_stream:
Expand Down Expand Up @@ -32,11 +33,11 @@ def add_formula_to_set(formulaeSet, data):
data = data.replace("<sub>", "_").replace("</sub>", "").replace("<sup>", "^").replace("</sup>", "")
data = data.strip()
oneElement = SingleLetterSingleElementFormulae.match(data)
if not(oneElement is None):
if not (oneElement is None):
formulaeSet.add(oneElement.group("single"))
else:
twoElements = SingleLetterDoubleElementFormulae.match(data)
if not(twoElements is None):
if not (twoElements is None):
formulaeSet.add(twoElements.group("first") + twoElements.group("second"))
formulaeSet.add(twoElements.group("second") + twoElements.group("first"))

Expand All @@ -46,8 +47,8 @@ def create_ions_from_wikipedia_page(in_file: str, out_file):
with open(out_file, 'w', encoding='utf8') as out_stream:
file_contents = BeautifulSoup(_in_stream, features="html.parser")
formulaeSet = set()
for ion in file_contents.find_all(class_= 'chemf'):
result = add_ion_to_set(formulaeSet, ion.decode_contents())
for ion in file_contents.find_all(class_='chemf'):
result = add_single_letter_ions_to_set(formulaeSet, ion.decode_contents())

result = ''
for formula in sorted(formulaeSet):
Expand All @@ -61,26 +62,24 @@ def create_ions_from_wikipedia_page(in_file: str, out_file):
if len(result) > 0:
out_stream.write(result)

# Matches ion markup in which a charge and a subscript are stacked inside one
# "template-chem2-su" span.  Group 1 is the text before the span, group 2 the
# charge (optional digit followed by "+" or the Unicode minus "−"), and
# group 3 the one-digit subscript.
BOTH_SCRIPTS = re.compile(r'([^<]+)<span class="template-chem2-su"><span>(\d?[+−])</span><span>(\d)</span></span>')


def add_single_letter_ions_to_set(formulaeSet, data: str) -> None:
    """Normalize one ion's HTML markup and add it to ``formulaeSet``.

    ``<sub ...>`` tags become "_" and ``<sup ...>`` tags become "^".  When the
    markup instead stacks charge and subscript in a single span (see
    ``BOTH_SCRIPTS``), the ion is rewritten as ``<text>_<subscript>^<charge>``;
    anything after the matched span is dropped.  The result is stripped of
    surrounding whitespace and added only if it contains no lower-case
    character, which excludes two-letter element symbols such as "Na" as well
    as any leftover HTML tags.

    Args:
        formulaeSet: set that collects the normalized ion strings; mutated
            in place.
        data: inner HTML of one ion element.

    Returns:
        None.
    """
    # the data isn't clean -- do some cleanup
    data = data.replace('<sub>', "_").replace('<sub class="template-chem2-sub">', "_").replace('</sub>', "") \
               .replace('<sup>', "^").replace('<sup class="template-chem2-sup">', "^").replace('</sup>', "")

    both_scripts = BOTH_SCRIPTS.match(data)
    if both_scripts is not None:
        prefix, charge, subscript = both_scripts.groups()
        # Subscript goes first so the output matches the "_d^charge" layout
        # produced by the plain <sub>/<sup> replacements above.
        data = "{}_{}^{}".format(prefix, subscript, charge)

    data = data.strip()
    # NOTE(review): an empty string also passes this check and would be added
    # to the set -- confirm whether callers can ever pass empty markup.
    if not any(ch.islower() for ch in data):
        formulaeSet.add(data)


# Script entry: build "chem_ions.txt" from "wikipedia-ions.html".
# The formula list is not rebuilt by default; uncomment the next line to also
# regenerate "chem_formula.txt" from "wikipedia-chemical_formulae.html".
# create_formulae_from_wikipedia_page("wikipedia-chemical_formulae.html", "chem_formula.txt")
create_ions_from_wikipedia_page("wikipedia-ions.html", "chem_ions.txt")

0 comments on commit 621fbc4

Please sign in to comment.