Skip to content

Commit

Permalink
Merge pull request #3064 from snbianco/ASB-27903-cloud-uris-from-query
Browse files Browse the repository at this point in the history
Streamlined method to get list of cloud URIs
  • Loading branch information
bsipocz authored Jul 16, 2024
2 parents 26050ed + a1349a1 commit cc25c37
Show file tree
Hide file tree
Showing 7 changed files with 214 additions and 63 deletions.
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,10 @@ mast

- Fix bug in ``Catalogs.query_criteria()`` to use ``page`` and ``pagesize`` parameters correctly. [#3065]

- Modify ``mast.Observations.get_cloud_uris`` to also accept query criteria and data product filters. [#3064]

- Increased the speed of ``mast.Observations.get_cloud_uris`` by obtaining multiple
URIs from MAST at once. [#3064]


0.4.7 (2024-03-08)
Expand Down
59 changes: 33 additions & 26 deletions astroquery/mast/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from astropy.utils.console import ProgressBarOrSpinner
from astropy.utils.exceptions import AstropyDeprecationWarning

from ..exceptions import NoResultsWarning, InvalidQueryError
from ..exceptions import NoResultsWarning

from . import utils

Expand Down Expand Up @@ -109,32 +109,14 @@ def get_cloud_uri(self, data_product, include_bucket=True, full_url=False):
found in the cloud, None is returned.
"""

s3_client = self.boto3.client('s3', config=self.config)

path = utils.mast_relative_path(data_product["dataURI"])
if path is None:
raise InvalidQueryError("Malformed data uri {}".format(data_product['dataURI']))
uri_list = self.get_cloud_uri_list(data_product, include_bucket=include_bucket, full_url=full_url)

if 'galex' in path:
path = path.lstrip("/mast/")
elif '/ps1/' in path:
path = path.replace("/ps1/", "panstarrs/ps1/public/")
# Making sure we got at least 1 URI from the query above.
if not uri_list or uri_list[0] is None:
warnings.warn("Unable to locate file {}.".format(data_product), NoResultsWarning)
else:
path = path.lstrip("/")

try:
s3_client.head_object(Bucket=self.pubdata_bucket, Key=path)
if include_bucket:
path = "s3://{}/{}".format(self.pubdata_bucket, path)
elif full_url:
path = "http://s3.amazonaws.com/{}/{}".format(self.pubdata_bucket, path)
return path
except self.botocore.exceptions.ClientError as e:
if e.response['Error']['Code'] != "404":
raise

warnings.warn("Unable to locate file {}.".format(data_product['productFilename']), NoResultsWarning)
return None
# Output from ``get_cloud_uri_list`` is always a list even when it's only 1 URI
return uri_list[0]

def get_cloud_uri_list(self, data_products, include_bucket=True, full_url=False):
"""
Expand All @@ -158,8 +140,33 @@ def get_cloud_uri_list(self, data_products, include_bucket=True, full_url=False)
List of URIs generated from the data products, list may contain entries that are None
if data_products includes products not found in the cloud.
"""
s3_client = self.boto3.client('s3', config=self.config)

return [self.get_cloud_uri(product, include_bucket, full_url) for product in data_products]
paths = utils.mast_relative_path(data_products["dataURI"])
if isinstance(paths, str): # Handle the case where only one product was requested
paths = [paths]

uri_list = []
for path in paths:
if path is None:
uri_list.append(None)
else:
try:
# Use `head_object` to verify that the product is available on S3 (not all products are)
s3_client.head_object(Bucket=self.pubdata_bucket, Key=path)
if include_bucket:
s3_path = "s3://{}/{}".format(self.pubdata_bucket, path)
uri_list.append(s3_path)
elif full_url:
path = "http://s3.amazonaws.com/{}/{}".format(self.pubdata_bucket, path)
uri_list.append(path)
except self.botocore.exceptions.ClientError as e:
if e.response['Error']['Code'] != "404":
raise
warnings.warn("Unable to locate file {}.".format(path), NoResultsWarning)
uri_list.append(None)

return uri_list

def download_file(self, data_product, local_path, cache=True, verbose=True):
"""
Expand Down
63 changes: 58 additions & 5 deletions astroquery/mast/observations.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,26 +774,56 @@ def download_products(self, products, *, download_dir=None, flat=False,

return manifest

def get_cloud_uris(self, data_products, *, include_bucket=True, full_url=False):
def get_cloud_uris(self, data_products=None, *, include_bucket=True, full_url=False, pagesize=None, page=None,
mrp_only=False, extension=None, filter_products={}, **criteria):
"""
Takes an `~astropy.table.Table` of data products and returns the associated cloud data uris.
Given an `~astropy.table.Table` of data products or query criteria and filter parameters,
returns the associated cloud data URIs.
Parameters
----------
data_products : `~astropy.table.Table`
Table containing products to be converted into cloud data uris.
Table containing products to be converted into cloud data uris. If provided, this will supersede
pagesize, page, or any keyword arguments passed in as criteria.
include_bucket : bool
Default True. When false returns the path of the file relative to the
Default True. When False, returns the path of the file relative to the
top level cloud storage location.
Must be set to False when using the full_url argument.
full_url : bool
Default False. Return an HTTP fetchable url instead of a cloud uri.
Must set include_bucket to False to use this option.
pagesize : int, optional
Default None. Can be used to override the default pagesize when making a query.
E.g. when using a slow internet connection. Query criteria must also be provided.
page : int, optional
Default None. Can be used to override the default behavior of all results being returned for a query
to obtain one specific page of results. Query criteria must also be provided.
mrp_only : bool, optional
Default False. When set to True, only "Minimum Recommended Products" will be returned.
extension : string or array, optional
Default None. Option to filter by file extension.
filter_products : dict, optional
Filters to be applied to data products. Valid filters are all products fields listed
`here <https://masttest.stsci.edu/api/v0/_productsfields.html>`__.
The column name as a string is the key. The corresponding value is one
or more acceptable values for that parameter.
Filter behavior is AND between the filters and OR within a filter set.
For example: {"productType": "SCIENCE", "extension": ["fits","jpg"]}
**criteria
Criteria to apply. At least one non-positional criterion must be supplied.
Valid criteria are coordinates, objectname, radius (as in `query_region` and `query_object`),
and all observation fields returned by the ``get_metadata("observations")``.
The Column Name is the keyword, with the argument being one or more acceptable values for that parameter,
except for fields with a float datatype where the argument should be in the form [minVal, maxVal].
For non-float type criteria wildcards may be used (both * and % are considered wildcards), however
only one wildcarded value can be processed per criterion.
RA and Dec must be given in decimal degrees, and datetimes in MJD.
For example: filters=["FUV","NUV"],proposal_pi="Ost*",t_max=[52264.4586,54452.8914]
Returns
-------
response : list
List of URIs generated from the data products, list way contain entries that are None
List of URIs generated from the data products. May contain entries that are None
if data_products includes products not found in the cloud.
"""

Expand All @@ -802,6 +832,29 @@ def get_cloud_uris(self, data_products, *, include_bucket=True, full_url=False):
'Please enable anonymous cloud access by calling `enable_cloud_dataset` method. '
'Refer to `~astroquery.mast.ObservationsClass.enable_cloud_dataset` documentation for more info.')

if data_products is None:
if not criteria:
raise InvalidQueryError(
'Please provide either a `~astropy.table.Table` of data products or query criteria.'
)
else:
# Get table of observations based on query criteria
obs = self.query_criteria(pagesize=pagesize, page=page, **criteria)

if not len(obs):
# Warning raised by ~astroquery.mast.ObservationsClass.query_criteria
return

# Return list of associated data products
data_products = self.get_product_list(obs)

# Filter product list
data_products = self.filter_products(data_products, mrp_only=mrp_only, extension=extension, **filter_products)

if not len(data_products):
warnings.warn("No matching products to fetch associated cloud URIs.", NoResultsWarning)
return

# Remove duplicate products
data_products = self._remove_duplicate_products(data_products)

Expand Down
37 changes: 34 additions & 3 deletions astroquery/mast/tests/test_mast_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,12 +526,13 @@ def test_get_cloud_uri(self, test_data_uri, expected_cloud_uri):
assert len(uri) > 0, f'Product for dataURI {test_data_uri} was not found in the cloud.'
assert uri == expected_cloud_uri, f'Cloud URI does not match expected. ({uri} != {expected_cloud_uri})'

def test_get_cloud_uris(self):
@pytest.mark.parametrize("test_obs_id", ["25568122", "31411"])
def test_get_cloud_uris(self, test_obs_id):
pytest.importorskip("boto3")
test_obs_id = '25568122'

# get a product list
products = Observations.get_product_list(test_obs_id)[24:]
index = 24 if test_obs_id == '25568122' else 0
products = Observations.get_product_list(test_obs_id)[index:]

assert len(products) > 0, (f'No products found for OBSID {test_obs_id}. '
'Unable to move forward with getting URIs from the cloud.')
Expand All @@ -544,6 +545,36 @@ def test_get_cloud_uris(self):

assert len(uris) > 0, f'Products for OBSID {test_obs_id} were not found in the cloud.'

# check for warning if no data products match filters
with pytest.warns(NoResultsWarning):
Observations.get_cloud_uris(products,
extension='png')

def test_get_cloud_uris_query(self):
    """Check that ``get_cloud_uris`` accepts query criteria and product filters directly.

    The streamlined call must return the same URIs as the explicit
    query -> product list -> filter -> URI chain, must raise when given
    neither products nor criteria, and must warn when the query is empty.
    """
    pytest.importorskip("boto3")

    # enable access to public AWS S3 bucket
    Observations.enable_cloud_dataset()

    # reference result: build the URI list step by step with the existing functions
    observations = Observations.query_criteria(target_name=234295610)
    products = Observations.get_product_list(observations)
    filtered_products = Observations.filter_products(products, calib_level=[2])
    expected_uris = Observations.get_cloud_uris(filtered_products)

    # same request expressed through the streamlined function
    streamlined_uris = Observations.get_cloud_uris(
        target_name=234295610,
        filter_products={'calib_level': [2]},
    )
    assert expected_uris == streamlined_uris

    # check that InvalidQueryError is thrown if neither data_products nor **criteria are defined
    with pytest.raises(InvalidQueryError):
        Observations.get_cloud_uris(filter_products={'calib_level': [2]})

    # check for warning if query returns no observations
    with pytest.warns(NoResultsWarning):
        Observations.get_cloud_uris(target_name=234295611)

######################
# CatalogClass tests #
######################
Expand Down
56 changes: 44 additions & 12 deletions astroquery/mast/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,22 +158,54 @@ def parse_input_location(coordinates=None, objectname=None):

def mast_relative_path(mast_uri):
    """
    Given one or more MAST dataURI(s), return the associated relative path(s).

    The URIs are sent to the MAST ``path_lookup`` service in batches, and each
    returned path is rewritten as follows: paths containing ``galex`` have their
    leading ``/mast/`` characters stripped, ``/ps1/`` is replaced with
    ``panstarrs/ps1/public/``, and any other path loses its leading ``/``.

    Parameters
    ----------
    mast_uri : str, list of str
        The MAST uri(s).

    Returns
    -------
    response : str, list of str
        The associated relative path(s). A single string is returned when a
        single string was passed in; otherwise a list in the same order as the
        input. An entry is None when the lookup service returned no path for
        the corresponding URI.
    """
    single_uri = isinstance(mast_uri, str)
    if single_uri:
        uri_list = [("uri", mast_uri)]
    else:  # mast_uri parameter is a list
        uri_list = [("uri", uri) for uri in mast_uri]

    result = []
    # Split the list into chunks of 50 URIs; this is necessary
    # to avoid "414 Client Error: Request-URI Too Large".
    for chunk in _split_list_into_chunks(uri_list, chunk_size=50):
        response = _simple_request("https://mast.stsci.edu/api/v0.1/path_lookup/",
                                   {"uri": chunk})
        json_response = response.json()

        # Chunk is a list of tuples where the tuple is
        # ("uri", "/path/to/product"), so only the second item is the URI.
        for _, uri in chunk:
            entry = json_response.get(uri) or {}
            path = entry.get("path")
            if path is None:
                # No path known for this URI: pass None through so callers can
                # report the product as unavailable instead of crashing here.
                result.append(None)
                continue

            if 'galex' in path:
                # NOTE: lstrip removes any leading run of the characters
                # '/', 'm', 'a', 's', 't' -- kept as-is to preserve behavior.
                path = path.lstrip("/mast/")
            elif '/ps1/' in path:
                path = path.replace("/ps1/", "panstarrs/ps1/public/")
            else:
                path = path.lstrip("/")
            result.append(path)

    # If the input was a single URI string, we return a single string
    if single_uri:
        return result[0]
    # Else, return a list of paths
    return result


def _split_list_into_chunks(input_list, chunk_size):
"""Helper function for `mast_relative_path`."""
for idx in range(0, len(input_list), chunk_size):
yield input_list[idx:idx + chunk_size]
23 changes: 11 additions & 12 deletions docs/mast/mast_catalog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,13 @@ The returned fields vary by catalog, find the field documentation for specific c
`here <https://mast.stsci.edu/api/v0/pages.html>`__.
If no catalog is specified, the Hubble Source Catalog will be queried.


.. doctest-remote-data::

>>> from astroquery.mast import Catalogs
...
>>> catalog_data = Catalogs.query_object("158.47924 -7.30962", catalog="Galex")
>>> print(catalog_data[:10])
>>> print(catalog_data[:10]) # doctest: +IGNORE_OUTPUT
distance_arcmin objID survey ... fuv_flux_aper_7 fuv_artifact
------------------ ------------------- ------ ... --------------- ------------
0.3493802506329695 6382034098673685038 AIS ... 0.047751952 0
Expand Down Expand Up @@ -261,19 +262,17 @@ Given an HSC Match ID, return all catalog results.
>>> catalog_data = Catalogs.query_object("M10", radius=.02, catalog="HSC")
>>> matchid = catalog_data[0]["MatchID"]
>>> print(matchid)
63980492
7542452
>>> matches = Catalogs.query_hsc_matchid(matchid)
>>> print(matches)
CatID MatchID ... cd_matrix
--------- -------- ... ------------------------------------------------------
257195287 63980492 ... -1.38889e-005 -5.26157e-010 -5.26157e-010 1.38889e-005
257440119 63980492 ... -1.38889e-005 -5.26157e-010 -5.26157e-010 1.38889e-005
428373428 63980492 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005
428373427 63980492 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005
428373429 63980492 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005
410574499 63980492 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005
410574498 63980492 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005
410574497 63980492 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005
CatID MatchID ... cd_matrix
--------- ------- ... ------------------------------------------------------
419094794 7542452 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005
419094795 7542452 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005
401289578 7542452 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005
401289577 7542452 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005
257194049 7542452 ... -1.38889e-005 -5.26157e-010 -5.26157e-010 1.38889e-005
257438887 7542452 ... -1.38889e-005 -5.26157e-010 -5.26157e-010 1.38889e-005


HSC spectra can be accessed through this class as well. `~astroquery.mast.CatalogsClass.get_hsc_spectra`
Expand Down
Loading

0 comments on commit cc25c37

Please sign in to comment.