Skip to content

Commit

Permalink
Improve the parsing of list of quantities, and allow the parsing of p…
Browse files Browse the repository at this point in the history
…ercentage values.

For example

     'Melting point: 75% -17.5 °C; 80% 4.6 °C; 85% 21 °C.' will now return
        [Quantity('75%'), Quantity('-17.5 degC')],
        [Quantity("80%"), Quantity('4.6 degC')],
        [Quantity("85%"), Quantity('21 degC')],
     Where previously it returned just `Quantity("4.6 degC")`.

All other examples continue to work as they did before, and strings such as "10%" now parse as well.
  • Loading branch information
bramp committed Jul 1, 2024
1 parent b4c2ce1 commit 1a990ae
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 5 deletions.
1 change: 0 additions & 1 deletion src/unit_parse/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ def __init__(self):
["cu m", "m**3"], # pint gets confused
["cu cm", "cm**3"], # pint gets confused
["cu mm", "mm**3"], # pint gets confused
["[0-9]{1,5} ?%", ""]
]

self.pre_proc_split = [";"]
Expand Down
32 changes: 30 additions & 2 deletions src/unit_parse/pre_processing_multiple.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from unit_parse.config import config
from unit_parse.logger import log_debug, log_info
from unit_parse.utils import remove_empty_str
from unit_parse.utils import contains_number, remove_empty_str


@log_info
Expand Down Expand Up @@ -52,6 +52,33 @@ def multiple_quantities(text_in: str, sep: list[str]) -> List[str]:
result = re.split(sep, text_in)
return [text.strip() for text in result]

def split_on_quantities(text_in: str) -> list[str]:
    """
    Split the string into a list of strings, where each string contains a single quantity.

    The text is cut at every run of whitespace that is immediately followed by an
    optionally negative digit. Pieces for which ``contains_number`` is still false are
    merged with the piece that follows, so leading descriptive text stays attached to
    the first quantity.

    Parameters
    ----------
    text_in: str
        text that may contain several quantities

    Returns
    -------
    list[str]
        the pieces of ``text_in`` with surrounding whitespace removed; the list
        always holds at least one element (an empty input gives ``['']``)

    Examples
    --------
    '18 mm Hg @ 68 °F' --> ['18 mm Hg @', '68 °F']
    'Melting point: 75% -17.5 °C' --> ['Melting point: 75%', '-17.5 °C']
    'Pass me a 300 ml beer.' --> ['Pass me a 300 ml beer.']
    """
    # Split on whitespace that precedes a digit (optionally with a minus sign).
    # The lookahead keeps the digit in the following piece. The whitespace itself
    # is captured on purpose: re.split() then returns it as its own item, which
    # lets the merge loop below rebuild text-only pieces with their original spacing.
    pieces = re.split(r'(\s+)(?=[-]?\d)', text_in)

    # The split also yields pieces without any number (plain text or the captured
    # whitespace). Keep appending to the last result until it contains a number.
    results = []
    for piece in pieces:
        if results and not contains_number(results[-1]):
            results[-1] += piece
        else:
            results.append(piece)

    # The captured separators end up at the front of every piece after the first;
    # strip them so the output matches the documented examples.
    return [result.strip() for result in results]


@log_debug
def condition_finder(text_in: str) -> List[str]:
Expand Down Expand Up @@ -89,7 +116,8 @@ def condition_finder(text_in: str) -> List[str]:
result = re.split("@", text)
out2 += [t.strip() for t in result]
else:
out2.append(text)
result = split_on_quantities(text)
out2 += result

return [text.strip() for text in out2]

Expand Down
7 changes: 6 additions & 1 deletion tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
['40°F', Quantity('40 degF')],
['20.80 mmHg', Quantity('20.80 mmHg')],
['20.80 mm Hg', Quantity('20.80 mmHg')], # correcting a unit that pint gets wrong
['10%', Quantity('10 %')],

# scientific notation
["15*10**2 s", Quantity("15*10**2 s")], # standard
Expand Down Expand Up @@ -76,7 +77,11 @@
[[Quantity('18 mmHg'), Quantity('68 degF')], [Quantity('20 mmHg'), Quantity('77 degF')]]],
["Low threshold= 13.1150 mg/cu m; High threshold= 26840 mg/cu m; Irritating concn= 22875 mg/cu m.",
Quantity('22875 mg/m**3')],
['Melting point: 75% -17.5 °C; 80% 4.6 °C; 85% 21 °C.', Quantity("4.6 degC")],
['Melting point: 75% -17.5 °C; 80% 4.6 °C; 85% 21 °C.', [
[Quantity('75%'), Quantity('-17.5 degC')],
[Quantity("80%"), Quantity('4.6 degC')],
[Quantity("85%"), Quantity('21 degC')],
]],

# ranges
['115.2-115.3 °C', Quantity('115.2 degC')],
Expand Down
3 changes: 2 additions & 1 deletion tests/test_pre_processing_multiple.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def test_reduce_parenthesis(input_, output_):
['18 mm Hg @ 68 °F ', ['18 mm Hg', '68 °F']],
['20 mm Hg @ 77° F', ['20 mm Hg', '77° F']],
[' 20 mm Hg @ 77° F (NTP, 1992)', ['20 mm Hg', '77° F', 'NTP, 1992']],
['Melting point: 75% -17.5 °C', ['Melting point: 75%', '-17.5 °C']],
['20.8 mm Hg 25 °C', ['20.8 mm Hg', '25 °C']],

['40 °F (4 °C) (Closed cup)', ['40 °F', '4 °C', 'Closed cup']],
['40 °F (4 °C)', ['40 °F', '4 °C']],
Expand All @@ -45,7 +47,6 @@ def test_reduce_parenthesis(input_, output_):
['(4 °C Closed cup)', ['4 °C Closed cup']],

# negative control (fails)
['20.8 mm Hg 25 °C', ['20.8 mm Hg 25 °C']],
['20.8 mm Hgat25 °C', ['20.8 mm Hgat25 °C']],
['Pass me a 300 ml beer.', ['Pass me a 300 ml beer.']],
["42.3 gcm-3", ["42.3 gcm-3"]],
Expand Down

0 comments on commit 1a990ae

Please sign in to comment.