Skip to content

Commit

Permalink
Merge pull request #2385 from keflavich/issue2375
Browse files Browse the repository at this point in the history
Fix molecule parsing issue for CDMS
  • Loading branch information
bsipocz authored May 6, 2022
2 parents ad6cb80 + 777d169 commit 729fd4d
Show file tree
Hide file tree
Showing 8 changed files with 265 additions and 61 deletions.
5 changes: 5 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ jplsbdb
- Fix a bug for jplsbdb query when the returned physical quantity contains
a unit with an exponent. [#2377]

linelists.cdms
^^^^^^^^^^^^^^

- Fix issues with the line name parser and the line data parser; the original
implementation was incomplete. [#2385]

Infrastructure, Utility and Other Changes and Additions
-------------------------------------------------------
Expand Down
116 changes: 99 additions & 17 deletions astroquery/linelists/cdms/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
from astroquery.utils import async_to_sync
# import configurable items declared in __init__.py
from astroquery.linelists.cdms import conf
from astroquery.jplspec import lookup_table
from astroquery.exceptions import InvalidQueryError, EmptyResponseError

import re
import string

__all__ = ['CDMS', 'CDMSClass']

Expand Down Expand Up @@ -52,6 +53,13 @@ def query_lines_async(self, min_frequency, max_frequency, *,
molecule : list, string of regex if parse_name_locally=True, optional
Identifiers of the molecules to search for. If this parameter
is not provided the search will match any species. Default is 'All'.
As a first pass, the molecule will be searched for with a direct
string match. If no string match is found, a regular expression
match is attempted. Note that if the molecule name regex contains
parentheses, they must be escaped. For example, 'H2C(CN)2.*' must be
specified as 'H2C\\(CN\\)2.*' (but because of the first-attempt
full-string matching, 'H2C(CN)2' will match that molecule
successfully).
temperature_for_intensity : float
The temperature to use when computing the intensity Smu^2. Set
Expand Down Expand Up @@ -126,12 +134,12 @@ def query_lines_async(self, min_frequency, max_frequency, *,
if parse_name_locally:
self.lookup_ids = build_lookup()
luts = self.lookup_ids.find(molecule, flags)
payload['Molecules'] = tuple(f"{val:06d} {key}"
for key, val in luts.items())[0]
if len(molecule) == 0:
if len(luts) == 0:
raise InvalidQueryError('No matching species found. Please '
'refine your search or read the Docs '
'for pointers on how to search.')
payload['Molecules'] = tuple(f"{val:06d} {key}"
for key, val in luts.items())[0]
else:
payload['Molecules'] = molecule

Expand Down Expand Up @@ -187,12 +195,14 @@ def _parse_result(self, response, verbose=False):
ELO: Lower state energy in cm^{-1} relative to the ground state.
GUP: Upper state degeneracy.
TAG: Species tag or molecular identifier.
A negative value flags that the line frequency has
been measured in the laboratory. The absolute value of TAG is then the
species tag and ERR is the reported experimental error. The three most
significant digits of the species tag are coded as the mass number of
the species.
MOLWT: The first half of the molecular weight tag, which is the mass in atomic
mass units (Daltons).
TAG: Species tag or molecular identifier. This only includes the
last 3 digits of the CDMS tag
A negative value of MOLWT flags that the line frequency has been
measured in the laboratory. We record this boolean in the 'Lab'
column. ERR is the reported experimental error.
QNFMT: Identifies the format of the quantum numbers
Ju/Ku/vu and Jl/Kl/vl are the upper/lower QNs
Expand All @@ -215,15 +225,21 @@ def _parse_result(self, response, verbose=False):
'DR': 36,
'ELO': 38,
'GUP': 48,
'TAG': 51,
'QNFMT': 57,
'MOLWT': 51,
'TAG': 54,
'QNFMT': 58,
'Ju': 61,
'Ku': 63,
'vu': 65,
'Jl': 67,
'Kl': 69,
'vl': 71,
'F': 73,
'F1u': 67,
'F2u': 69,
'F3u': 71,
'Jl': 73,
'Kl': 75,
'vl': 77,
'F1l': 79,
'F2l': 81,
'F3l': 83,
'name': 89}

result = ascii.read(text, header_start=None, data_start=0,
Expand All @@ -235,6 +251,18 @@ def _parse_result(self, response, verbose=False):
result['FREQ'].unit = u.MHz
result['ERR'].unit = u.MHz

result['Lab'] = result['MOLWT'] < 0
result['MOLWT'] = np.abs(result['MOLWT'])
result['MOLWT'].unit = u.Da

for suf in 'ul':
for qn in ('J', 'v', 'K', 'F1', 'F2', 'F3'):
qnind = qn+suf
if result[qnind].dtype != int:
intcol = np.array(list(map(parse_letternumber, result[qnind])),
dtype=int)
result[qnind] = intcol

# if there is a crash at this step, something went wrong with the query
# and the _last_query_temperature was not set. This shouldn't ever
# happen, but, well, I anticipate it will.
Expand Down Expand Up @@ -303,12 +331,66 @@ def tryfloat(x):
CDMS = CDMSClass()


def parse_letternumber(st):
    """
    Convert one of CDMS's two-character quantum-number fields to an integer.

    The CDMS documentation describes the encoding as follows:
    "Exactly two characters are available for each quantum number. Therefore, half
    integer quanta are rounded up ! In addition, capital letters are used to
    indicate quantum numbers larger than 99. E. g. A0 is 100, Z9 is 359. Small
    types are used to signal corresponding negative quantum numbers."

    Every character is translated on its own: an uppercase letter becomes its
    alphabet index plus 10 ('A' -> '10', ..., 'Z' -> '35'), a lowercase letter
    becomes the same number preceded by a minus sign, and any other character
    is kept as it is. The translated pieces are joined and handed to ``int``.
    """
    lowercase = string.ascii_lowercase
    uppercase = string.ascii_uppercase

    pieces = []
    for char in st:
        if char in lowercase:
            pieces.append('-' + str(lowercase.index(char) + 10))
        elif char in uppercase:
            pieces.append(str(uppercase.index(char) + 10))
        else:
            # digits, signs and blanks pass through untouched
            pieces.append(char)

    return int(''.join(pieces))


class Lookuptable(dict):
    """
    Dictionary whose keys can be searched by substring or regular expression.
    """

    def find(self, st, flags):
        """
        Search the dictionary keys for entries that match ``st``.

        A key matches when ``st`` occurs in it as a plain substring, or when
        ``st`` interpreted as a regular expression is found in it by
        `re.search`. The regular expression is only tried for keys that do
        not already contain ``st`` literally.

        Parameters
        ----------
        st : str
            String to look for, first literally and then as a regular
            expression.
            Can be entered non-specific for broader results
            ('H2O' yields 'H2O' but will also yield 'HCCCH2OD')
            or as the specific desired regular expression for
            catered results, for example 'H2O$' only matches keys that
            end in 'H2O'.

        flags : int
            Regular expression flags.

        Returns
        -------
        dict
            The key/value pairs of this table whose keys matched.
        """

        out = {}

        for kk, vv in self.items():
            # note that the string-match attempt here differs from the jplspec
            # implementation
            # Coerce once so the substring test and the regex test look at
            # the same text; a non-str key would otherwise raise TypeError
            # in the ``in`` test.
            name = str(kk)
            if (st in name) or re.search(st, name, flags=flags):
                out[kk] = vv

        return out


def build_lookup():
    """
    Build the lookup table used to resolve molecule names locally.

    The species table is fetched with ``CDMS.get_species_table()``; entry 1
    of the result supplies the keys and entry 0 the values.

    Returns
    -------
    Lookuptable
        Searchable mapping built from the species table.
    """

    result = CDMS.get_species_table()
    keys = list(result[1][:])  # convert NAME column to list
    values = list(result[0][:])  # convert TAG column to list
    dictionary = dict(zip(keys, values))  # make k,v dictionary
    # Only the Lookuptable class defined in this module is used; the earlier
    # assignment from the jplspec ``lookup_table`` module was overwritten
    # immediately and has been dropped.
    lookuptable = Lookuptable(dictionary)  # apply the class above

    return lookuptable
5 changes: 4 additions & 1 deletion astroquery/linelists/cdms/setup_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@

def get_package_data():

paths_test = [os.path.join('data', 'CO.data')]
paths_test = [os.path.join('data', 'CO.data'),
os.path.join('data', 'HC7S.data'),
os.path.join('data', 'post_response.html'),
]
paths_data = [os.path.join('data', 'catdir.cat')]

return {'astroquery.linelists.cdms.tests': paths_test,
Expand Down
16 changes: 16 additions & 0 deletions astroquery/linelists/cdms/tests/data/HC7S.data
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US" xml:lang="en-US">
<head>
<title>Untitled Document</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
</head>
<body>
<pre>
100694.0650 0.4909 -3.9202 2 210.7681255 117501 224C6-1C7C7 C5 1C6C6 HC7S
100694.0675 0.4909 -3.9237 2 210.7682253 117501 224C6-1C7C6 C5 1C6C5 HC7S
100696.6906 0.4909 -3.9202 2 210.7790255 117501 224C6 1C7C7 C5-1C6C6 HC7S
100696.6933 0.4909 -3.9237 2 210.7789253 117501 224C6 1C7C6 C5-1C6C5 HC7S
100755.6075 0.4921 -4.0157 2 255.1740253 117501 224C7-1C7C6 C6 1C6C5 HC7S
</pre></body></html>
9 changes: 9 additions & 0 deletions astroquery/linelists/cdms/tests/data/post_response.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<html><head>
</head>
<frameset rows="60,*" bordercolor="#ffffff">
<frame src="/classic/predictions/cdmstabhead.html" border=0 frameborder=0 framespacing=0 marginheight=30 marginwidth=16 scrolling=no>
<frame src="/classic/predictions/cdmscache/cdmstab{replace}.html" border=0 frameborder=0 framespacing=0 marginheight=0 marginwidth=12 scrolling=yes>
</frameset>
<body>Sorry, your browser does not support frames!<br>
<a href="/classic/predictions/cdmscache/cdmstab{replace}.html">tabular</a></body>
</html>
93 changes: 78 additions & 15 deletions astroquery/linelists/cdms/tests/test_cdms.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
import numpy as np
import pytest

import os
import requests

from astropy import units as u
from astropy.table import Table
from astroquery.linelists.cdms import CDMS
from astroquery.linelists.cdms.core import CDMS, parse_letternumber
from astroquery.utils.mocks import MockResponse

colname_set = set(['FREQ', 'ERR', 'LGINT', 'DR', 'ELO', 'GUP', 'TAG', 'QNFMT',
'Ju', 'Jl', "vu", "F1u", "F2u", "F3u", "vl", "Ku", "Kl",
"F1l", "F2l", "F3l", "name", "MOLWT", "Lab"])


def data_path(filename):
Expand All @@ -13,26 +20,36 @@ def data_path(filename):
return os.path.join(data_dir, filename)


class MockResponseSpec:
def mockreturn(*args, method='GET', data=None, url='', **kwargs):
    """
    Offline replacement for the request call, serving canned test data.

    Parameters
    ----------
    method : str
        'GET' returns the contents of ``<molecule>.data``, where the molecule
        name is the text between 'cdmstab' and the next '.' in ``url``.
        'POST' returns ``post_response.html`` with its ``{replace}``
        placeholders filled in with ``data['Molecules']``.
    data : dict or sequence of pairs, optional
        Request payload; only read for 'POST'. Defaults to an empty payload.
    url : str
        Requested URL; only read for 'GET'.
    *args, **kwargs
        Accepted and ignored.

    Returns
    -------
    MockResponse or None
        The canned response, or None for any other ``method``.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if data is None:
        data = {}

    if method == 'GET':
        molecule = url.split('cdmstab')[1].split('.')[0]
        with open(data_path(molecule+".data"), 'rb') as fh:
            content = fh.read()
        return MockResponse(content=content)
    elif method == 'POST':
        molecule = dict(data)['Molecules']
        with open(data_path("post_response.html"), 'r') as fh:
            content = fh.read().format(replace=molecule).encode()
        return MockResponse(content=content)

    # Unsupported methods keep the original behaviour of yielding None.
    return None


def __init__(self, filename):
self.filename = data_path(filename)
@pytest.fixture
def patch_post(request):
mp = request.getfixturevalue("monkeypatch")

@property
def text(self):
with open(self.filename) as f:
return f.read()
mp.setattr(CDMS, '_request', mockreturn)
return mp


def test_input_async():

response = CDMS.query_lines_async(min_frequency=100 * u.GHz,
max_frequency=1000 * u.GHz,
min_strength=-500,
molecule="028001 CO",
molecule="028503 CO, v=0",
get_query_payload=True)
response = dict(response)
assert response['Molecules'] == "028001 CO"
assert response['Molecules'] == "028503 CO, v=0"
np.testing.assert_almost_equal(response['MinNu'], 100.)
np.testing.assert_almost_equal(response['MaxNu'], 1000.)

Expand All @@ -51,15 +68,61 @@ def test_input_multi():
np.testing.assert_almost_equal(response['MaxNu'], 1000.)


def test_query():
def test_query(patch_post):

response = MockResponseSpec('CO.data')
tbl = CDMS._parse_result(response)
tbl = CDMS.query_lines(min_frequency=100 * u.GHz,
max_frequency=1000 * u.GHz,
min_strength=-500,
molecule="CO")
assert isinstance(tbl, Table)
assert len(tbl) == 8
assert set(tbl.keys()) == set(['FREQ', 'ERR', 'LGINT', 'DR', 'ELO', 'GUP',
'TAG', 'QNFMT', 'Ju', 'Jl', "vu", "vl", "Ku", "Kl", "F", "name"])
assert set(tbl.keys()) == colname_set

assert tbl['FREQ'][0] == 115271.2018
assert tbl['ERR'][0] == .0005
assert tbl['LGINT'][0] == -7.1425


def test_parseletternumber():
    """
    Check the decoding of CDMS letter-encoded quantum numbers.

    The CDMS documentation flags this as "Very Important":
    Exactly two characters are available for each quantum number. Therefore, half
    integer quanta are rounded up ! In addition, capital letters are used to
    indicate quantum numbers larger than 99. E. g. A0 is 100, Z9 is 359. Small
    types are used to signal corresponding negative quantum numbers.
    """
    cases = (
        # examples from the docs
        ("A0", 100),
        ("Z9", 359),
        # inferred?
        ("z9", -359),
        ("ZZ", 3535),
    )

    for encoded, expected in cases:
        assert parse_letternumber(encoded) == expected


def test_hc7s(patch_post):
    """
    Query a molecule whose quantum numbers use the letter encoding.

    Equivalent live query:
    CDMS.query_lines_async(100*u.GHz, 100.755608*u.GHz, molecule='HC7S', parse_name_locally=True)
    """
    tbl = CDMS.query_lines(100*u.GHz, 100.755608*u.GHz, molecule='HC7S',)

    # overall shape of the returned table
    assert isinstance(tbl, Table)
    assert len(tbl) == 5
    assert set(tbl.keys()) == colname_set

    # expected contents of the first row, checked in this order
    first_row = (
        ('FREQ', 100694.065),
        ('ERR', 0.4909),
        ('LGINT', -3.9202),
        ('MOLWT', 117),
        ('Ju', 126),
        ('Jl', 125),
        ('vu', 127),
        ('vl', 126),
        ('Ku', -1),
        ('Kl', 1),
        ('F1u', 127),
        ('F1l', 126),
    )
    for column, expected in first_row:
        assert tbl[column][0] == expected
Loading

0 comments on commit 729fd4d

Please sign in to comment.