Merge pull request #2385 from keflavich/issue2375

Fix molecule parsing issue for CDMS
astropy · May 6, 2022 · 729fd4d · 729fd4d
2 parents ad6cb80 + 777d169
commit 729fd4d
Show file tree

Hide file tree

Showing 8 changed files with 265 additions and 61 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -32,6 +32,11 @@ jplsbdb
 - Fix a bug for jplsdbd query when the returned physical quantity contains
   a unit with exponential. [#2377]
 
+linelists.cdms
+^^^^^^^^^^^^^^
+
+- Fix issues with the line name parser and the line data parser; the original
+  implementation was incomplete. [#2385]
 
 Infrastructure, Utility and Other Changes and Additions
 -------------------------------------------------------

diff --git a/astroquery/linelists/cdms/core.py b/astroquery/linelists/cdms/core.py
@@ -10,9 +10,10 @@
 from astroquery.utils import async_to_sync
 # import configurable items declared in __init__.py
 from astroquery.linelists.cdms import conf
-from astroquery.jplspec import lookup_table
 from astroquery.exceptions import InvalidQueryError, EmptyResponseError
 
+import re
+import string
 
 __all__ = ['CDMS', 'CDMSClass']
 
@@ -52,6 +53,13 @@ def query_lines_async(self, min_frequency, max_frequency, *,
         molecule : list, string of regex if parse_name_locally=True, optional
             Identifiers of the molecules to search for. If this parameter
             is not provided the search will match any species. Default is 'All'.
+            As a first pass, the molecule will be searched for with a direct
+            string match.  If no string match is found, a regular expression
+            match is attempted.  Note that if the molecule name regex contains
+            parentheses, they must be escaped.  For example, 'H2C(CN)2.*' must be
+            specified as 'H2C\\(CN\\)2.*'  (but because of the first-attempt
+            substring matching, 'H2C(CN)2' will match that molecule
+            successfully).
 
         temperature_for_intensity : float
             The temperature to use when computing the intensity Smu^2.  Set
@@ -126,12 +134,12 @@ def query_lines_async(self, min_frequency, max_frequency, *,
             if parse_name_locally:
                 self.lookup_ids = build_lookup()
                 luts = self.lookup_ids.find(molecule, flags)
-                payload['Molecules'] = tuple(f"{val:06d} {key}"
-                                             for key, val in luts.items())[0]
-                if len(molecule) == 0:
+                if len(luts) == 0:
                     raise InvalidQueryError('No matching species found. Please '
                                             'refine your search or read the Docs '
                                             'for pointers on how to search.')
+                payload['Molecules'] = tuple(f"{val:06d} {key}"
+                                             for key, val in luts.items())[0]
             else:
                 payload['Molecules'] = molecule
 
@@ -187,12 +195,14 @@ def _parse_result(self, response, verbose=False):
 
         ELO:   Lower state energy in cm^{-1} relative to the ground state.
         GUP:   Upper state degeneracy.
-        TAG:   Species tag or molecular identifier.
-            A negative value flags that the line frequency has
-            been measured in the laboratory.  The absolute value of TAG is then the
-            species tag and ERR is the reported experimental error.  The three most
-            significant digits of the species tag are coded as the mass number of
-            the species.
+        MOLWT: The first half of the molecular weight tag, which is the mass in atomic
+               mass units (Daltons).
+        TAG:   Species tag or molecular identifier.  This only includes the
+               last 3 digits of the CDMS tag
+
+        A negative value of MOLWT flags that the line frequency has been
+        measured in the laboratory.  We record this boolean in the 'Lab'
+        column.  ERR is the reported experimental error.
 
         QNFMT: Identifies the format of the quantum numbers
         Ju/Ku/vu and Jl/Kl/vl are the upper/lower QNs
@@ -215,15 +225,21 @@ def _parse_result(self, response, verbose=False):
                   'DR': 36,
                   'ELO': 38,
                   'GUP': 48,
-                  'TAG': 51,
-                  'QNFMT': 57,
+                  'MOLWT': 51,
+                  'TAG': 54,
+                  'QNFMT': 58,
                   'Ju': 61,
                   'Ku': 63,
                   'vu': 65,
-                  'Jl': 67,
-                  'Kl': 69,
-                  'vl': 71,
-                  'F': 73,
+                  'F1u': 67,
+                  'F2u': 69,
+                  'F3u': 71,
+                  'Jl': 73,
+                  'Kl': 75,
+                  'vl': 77,
+                  'F1l': 79,
+                  'F2l': 81,
+                  'F3l': 83,
                   'name': 89}
 
         result = ascii.read(text, header_start=None, data_start=0,
@@ -235,6 +251,18 @@ def _parse_result(self, response, verbose=False):
         result['FREQ'].unit = u.MHz
         result['ERR'].unit = u.MHz
 
+        result['Lab'] = result['MOLWT'] < 0
+        result['MOLWT'] = np.abs(result['MOLWT'])
+        result['MOLWT'].unit = u.Da
+
+        for suf in 'ul':
+            for qn in ('J', 'v', 'K', 'F1', 'F2', 'F3'):
+                qnind = qn+suf
+                if result[qnind].dtype != int:
+                    intcol = np.array(list(map(parse_letternumber, result[qnind])),
+                                      dtype=int)
+                    result[qnind] = intcol
+
         # if there is a crash at this step, something went wrong with the query
         # and the _last_query_temperature was not set.  This shouldn't ever
         # happen, but, well, I anticipate it will.
@@ -303,12 +331,66 @@ def tryfloat(x):
 CDMS = CDMSClass()
 
 
+def parse_letternumber(st):
+    """
+    Parse CDMS's two-character quantum numbers (QNs)
+
+    From the CDMS docs:
+    "Exactly two characters are available for each quantum number. Therefore, half
+    integer quanta are rounded up ! In addition, capital letters are used to
+    indicate quantum numbers larger than 99. E. g. A0 is 100, Z9 is 359. Small
+    types are used to signal corresponding negative quantum numbers."
+    """
+    asc = string.ascii_lowercase
+    ASC = string.ascii_uppercase
+    newst = ''.join(['-' + str(asc.index(x)+10) if x in asc else
+                     str(ASC.index(x)+10) if x in ASC else
+                     x for x in st])
+    return int(newst)
+
+
+class Lookuptable(dict):
+
+    def find(self, st, flags):
+        """
+        Search dictionary keys for a substring or regex match to string st
+
+        Parameters
+        ----------
+        st : str
+            String to compile as a regular expression
+            Can be entered non-specific for broader results
+            ('H2O' yields 'H2O' but will also yield 'HCCCH2OD')
+            or as the specific desired regular expression for
+            catered results, for example: ('H2O$' yields only 'H2O')
+
+        flags : int
+            Regular expression flags.
+
+        Returns
+        -------
+        A dict mapping each matching key to its value
+
+        """
+
+        out = {}
+
+        for kk, vv in self.items():
+            # note that the string-match attempt here differs from the jplspec
+            # implementation
+            match = (st in kk) or re.search(st, str(kk), flags=flags)
+            if match:
+                out[kk] = vv
+
+        return out
+
+
 def build_lookup():
 
     result = CDMS.get_species_table()
     keys = list(result[1][:])  # convert NAME column to list
     values = list(result[0][:])  # convert TAG column to list
     dictionary = dict(zip(keys, values))  # make k,v dictionary
-    lookuptable = lookup_table.Lookuptable(dictionary)  # apply the class above
+    lookuptable = Lookuptable(dictionary)  # apply the class above
 
     return lookuptable
diff --git a/astroquery/linelists/cdms/setup_package.py b/astroquery/linelists/cdms/setup_package.py
@@ -6,7 +6,10 @@
 
 def get_package_data():
 
-    paths_test = [os.path.join('data', 'CO.data')]
+    paths_test = [os.path.join('data', 'CO.data'),
+                  os.path.join('data', 'HC7S.data'),
+                  os.path.join('data', 'post_response.html'),
+                  ]
     paths_data = [os.path.join('data', 'catdir.cat')]
 
     return {'astroquery.linelists.cdms.tests': paths_test,

diff --git a/astroquery/linelists/cdms/tests/data/HC7S.data b/astroquery/linelists/cdms/tests/data/HC7S.data
@@ -0,0 +1,16 @@
+<!DOCTYPE html
+	PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+	 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US" xml:lang="en-US">
+<head>
+<title>Untitled Document</title>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
+</head>
+<body>
+<pre>
+  100694.0650     0.4909    -3.9202 2  210.7681255 117501 224C6-1C7C7    C5 1C6C6        HC7S
+  100694.0675     0.4909    -3.9237 2  210.7682253 117501 224C6-1C7C6    C5 1C6C5        HC7S
+  100696.6906     0.4909    -3.9202 2  210.7790255 117501 224C6 1C7C7    C5-1C6C6        HC7S
+  100696.6933     0.4909    -3.9237 2  210.7789253 117501 224C6 1C7C6    C5-1C6C5        HC7S
+  100755.6075     0.4921    -4.0157 2  255.1740253 117501 224C7-1C7C6    C6 1C6C5        HC7S
+</pre></body></html>
diff --git a/astroquery/linelists/cdms/tests/data/post_response.html b/astroquery/linelists/cdms/tests/data/post_response.html
@@ -0,0 +1,9 @@
+<html><head>
+</head>
+<frameset rows="60,*" bordercolor="#ffffff">
+<frame src="/classic/predictions/cdmstabhead.html" border=0 frameborder=0 framespacing=0 marginheight=30 marginwidth=16 scrolling=no>
+<frame src="/classic/predictions/cdmscache/cdmstab{replace}.html" border=0 frameborder=0 framespacing=0 marginheight=0 marginwidth=12 scrolling=yes>
+</frameset>
+<body>Sorry, your browser does not support frames!<br>
+<a href="/classic/predictions/cdmscache/cdmstab{replace}.html">tabular</a></body>
+</html>
diff --git a/astroquery/linelists/cdms/tests/test_cdms.py b/astroquery/linelists/cdms/tests/test_cdms.py
@@ -1,10 +1,17 @@
 import numpy as np
+import pytest
 
 import os
+import requests
 
 from astropy import units as u
 from astropy.table import Table
-from astroquery.linelists.cdms import CDMS
+from astroquery.linelists.cdms.core import CDMS, parse_letternumber
+from astroquery.utils.mocks import MockResponse
+
+colname_set = set(['FREQ', 'ERR', 'LGINT', 'DR', 'ELO', 'GUP', 'TAG', 'QNFMT',
+                   'Ju', 'Jl', "vu", "F1u", "F2u", "F3u", "vl", "Ku", "Kl",
+                   "F1l", "F2l", "F3l", "name", "MOLWT", "Lab"])
 
 
 def data_path(filename):
@@ -13,26 +20,36 @@ def data_path(filename):
     return os.path.join(data_dir, filename)
 
 
-class MockResponseSpec:
+def mockreturn(*args, method='GET', data={}, url='', **kwargs):
+    if method == 'GET':
+        molecule = url.split('cdmstab')[1].split('.')[0]
+        with open(data_path(molecule+".data"), 'rb') as fh:
+            content = fh.read()
+        return MockResponse(content=content)
+    elif method == 'POST':
+        molecule = dict(data)['Molecules']
+        with open(data_path("post_response.html"), 'r') as fh:
+            content = fh.read().format(replace=molecule).encode()
+        return MockResponse(content=content)
+
 
-    def __init__(self, filename):
-        self.filename = data_path(filename)
+@pytest.fixture
+def patch_post(request):
+    mp = request.getfixturevalue("monkeypatch")
 
-    @property
-    def text(self):
-        with open(self.filename) as f:
-            return f.read()
+    mp.setattr(CDMS, '_request', mockreturn)
+    return mp
 
 
 def test_input_async():
 
     response = CDMS.query_lines_async(min_frequency=100 * u.GHz,
                                       max_frequency=1000 * u.GHz,
                                       min_strength=-500,
-                                      molecule="028001 CO",
+                                      molecule="028503 CO, v=0",
                                       get_query_payload=True)
     response = dict(response)
-    assert response['Molecules'] == "028001 CO"
+    assert response['Molecules'] == "028503 CO, v=0"
     np.testing.assert_almost_equal(response['MinNu'], 100.)
     np.testing.assert_almost_equal(response['MaxNu'], 1000.)
 
@@ -51,15 +68,61 @@ def test_input_multi():
     np.testing.assert_almost_equal(response['MaxNu'], 1000.)
 
 
-def test_query():
+def test_query(patch_post):
 
-    response = MockResponseSpec('CO.data')
-    tbl = CDMS._parse_result(response)
+    tbl = CDMS.query_lines(min_frequency=100 * u.GHz,
+                           max_frequency=1000 * u.GHz,
+                           min_strength=-500,
+                           molecule="CO")
     assert isinstance(tbl, Table)
     assert len(tbl) == 8
-    assert set(tbl.keys()) == set(['FREQ', 'ERR', 'LGINT', 'DR', 'ELO', 'GUP',
-                                   'TAG', 'QNFMT', 'Ju', 'Jl', "vu", "vl", "Ku", "Kl", "F", "name"])
+    assert set(tbl.keys()) == colname_set
 
     assert tbl['FREQ'][0] == 115271.2018
     assert tbl['ERR'][0] == .0005
     assert tbl['LGINT'][0] == -7.1425
+
+
+def test_parseletternumber():
+    """
+    Very Important:
+    Exactly two characters are available for each quantum number. Therefore, half
+    integer quanta are rounded up ! In addition, capital letters are used to
+    indicate quantum numbers larger than 99. E. g. A0 is 100, Z9 is 359. Small
+    types are used to signal corresponding negative quantum numbers.
+    """
+
+    # examples from the docs
+    assert parse_letternumber("A0") == 100
+    assert parse_letternumber("Z9") == 359
+
+    # inferred?
+    assert parse_letternumber("z9") == -359
+    assert parse_letternumber("ZZ") == 3535
+
+
+def test_hc7s(patch_post):
+    """
+    Test for a very complicated molecule
+
+    CDMS.query_lines_async(100*u.GHz, 100.755608*u.GHz, molecule='HC7S', parse_name_locally=True)
+    """
+
+    tbl = CDMS.query_lines(100*u.GHz, 100.755608*u.GHz, molecule='HC7S',)
+    assert isinstance(tbl, Table)
+    assert len(tbl) == 5
+    assert set(tbl.keys()) == colname_set
+
+    assert tbl['FREQ'][0] == 100694.065
+    assert tbl['ERR'][0] == 0.4909
+    assert tbl['LGINT'][0] == -3.9202
+    assert tbl['MOLWT'][0] == 117
+
+    assert tbl['Ju'][0] == 126
+    assert tbl['Jl'][0] == 125
+    assert tbl['vu'][0] == 127
+    assert tbl['vl'][0] == 126
+    assert tbl['Ku'][0] == -1
+    assert tbl['Kl'][0] == 1
+    assert tbl['F1u'][0] == 127
+    assert tbl['F1l'][0] == 126