diff --git a/.copier-answers.yml b/.copier-answers.yml index 44cede77..886b28f1 100644 --- a/.copier-answers.yml +++ b/.copier-answers.yml @@ -15,9 +15,9 @@ include_benchmarks: true include_docs: true include_notebooks: true mypy_type_checking: basic -package_name: hipscat_import +package_name: hats_import project_license: BSD -project_name: hipscat-import +project_name: hats-import project_organization: astronomy-commons python_versions: - '3.9' diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 6a7b51dd..8d01c723 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -26,7 +26,7 @@ If it fixes an open issue, please link to the issue here. If this PR closes an i ## Code Quality -- [ ] I have read the [Contribution Guide](https://hipscat-import.readthedocs.io/en/stable/guide/contributing.html) and [LINCC Frameworks Code of Conduct](https://lsstdiscoveryalliance.org/programs/lincc-frameworks/code-conduct/) +- [ ] I have read the [Contribution Guide](https://hats-import.readthedocs.io/en/stable/guide/contributing.html) and [LINCC Frameworks Code of Conduct](https://lsstdiscoveryalliance.org/programs/lincc-frameworks/code-conduct/) - [ ] My code follows the code style of this project - [ ] My code builds (or compiles) cleanly without any errors or warnings - [ ] My code contains relevant comments and necessary documentation diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index 49231cf6..ca15234a 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -32,7 +32,7 @@ jobs: python -m pip install --upgrade pip pip install . - name: Create lock requirements file - run: pip list --format=freeze --exclude "hipscat-import" > requirements.txt + run: pip list --format=freeze --exclude "hats-import" > requirements.txt - name: Install dev dependencies run: pip install .[dev] - name: Run unit tests with pytest diff --git a/.github/workflows/testing-and-coverage.yml b/.github/workflows/testing-and-coverage.yml index bb6c1668..6a2a7c7a 100644 --- a/.github/workflows/testing-and-coverage.yml +++ b/.github/workflows/testing-and-coverage.yml @@ -31,7 +31,7 @@ jobs: if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Run unit tests with pytest run: | - python -m pytest tests --cov=hipscat_import --cov-report=xml + python -m pytest tests --cov=hats_import --cov-report=xml - name: Run dask-on-ray tests with pytest run: | python -m pytest tests --use_ray diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d06c1c1a..29bd788c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -101,7 +101,7 @@ repos: entry: mypy language: system types: [python] - files: ^(src|tests)/ + files: ^src/ args: [ "--ignore-missing-imports", # Ignore imports without type hints diff --git a/README.md b/README.md index a7f2805b..b4d56657 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,33 @@ -# hipscat-import +# hats-import [![Template](https://img.shields.io/badge/Template-LINCC%20Frameworks%20Python%20Project%20Template-brightgreen)](https://lincc-ppt.readthedocs.io/en/stable/) -[![PyPI](https://img.shields.io/pypi/v/hipscat-import?color=blue&logo=pypi&logoColor=white)](https://pypi.org/project/hipscat-import/) -[![Conda](https://img.shields.io/conda/vn/conda-forge/hipscat-import.svg?color=blue&logo=condaforge&logoColor=white)](https://anaconda.org/conda-forge/hipscat-import) 
+[![PyPI](https://img.shields.io/pypi/v/hats-import?color=blue&logo=pypi&logoColor=white)](https://pypi.org/project/hats-import/) +[![Conda](https://img.shields.io/conda/vn/conda-forge/hats-import.svg?color=blue&logo=condaforge&logoColor=white)](https://anaconda.org/conda-forge/hats-import) -[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/astronomy-commons/hipscat-import/smoke-test.yml)](https://github.com/astronomy-commons/hipscat-import/actions/workflows/smoke-test.yml) -[![codecov](https://codecov.io/gh/astronomy-commons/hipscat-import/branch/main/graph/badge.svg)](https://codecov.io/gh/astronomy-commons/hipscat-import) -[![Read the Docs](https://img.shields.io/readthedocs/hipscat-import)](https://hipscat-import.readthedocs.io/) +[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/astronomy-commons/hats-import/smoke-test.yml)](https://github.com/astronomy-commons/hats-import/actions/workflows/smoke-test.yml) +[![codecov](https://codecov.io/gh/astronomy-commons/hats-import/branch/main/graph/badge.svg)](https://codecov.io/gh/astronomy-commons/hats-import) +[![Read the Docs](https://img.shields.io/readthedocs/hats-import)](https://hats-import.readthedocs.io/) -## HiPSCat import - Utility for ingesting large survey data into HiPSCat structure. +## HATS import - Utility for ingesting large survey data into HATS structure. -Check out our [ReadTheDocs site](https://hipscat-import.readthedocs.io/en/stable/) +Check out our [ReadTheDocs site](https://hats-import.readthedocs.io/en/stable/) for more information on partitioning, installation, and contributing. See related projects: -* HiPSCat ([on GitHub](https://github.com/astronomy-commons/hipscat)) - ([on ReadTheDocs](https://hipscat.readthedocs.io/en/stable/)) +* HATS ([on GitHub](https://github.com/astronomy-commons/hats)) + ([on ReadTheDocs](https://hats.readthedocs.io/en/stable/)) * LSDB ([on GitHub](https://github.com/astronomy-commons/lsdb)) ([on ReadTheDocs](https://lsdb.readthedocs.io/en/stable/)) ## Contributing -[![GitHub issue custom search in repo](https://img.shields.io/github/issues-search/astronomy-commons/hipscat-import?color=purple&label=Good%20first%20issues&query=is%3Aopen%20label%3A%22good%20first%20issue%22)](https://github.com/astronomy-commons/hipscat-import/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) +[![GitHub issue custom search in repo](https://img.shields.io/github/issues-search/astronomy-commons/hats-import?color=purple&label=Good%20first%20issues&query=is%3Aopen%20label%3A%22good%20first%20issue%22)](https://github.com/astronomy-commons/hats-import/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) -See the [contribution guide](https://hipscat-import.readthedocs.io/en/stable/guide/contributing.html) +See the [contribution guide](https://hats-import.readthedocs.io/en/stable/guide/contributing.html) for complete installation instructions and contribution best practices. ## Acknowledgements diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json index fbe36f21..1c4c537e 100644 --- a/benchmarks/asv.conf.json +++ b/benchmarks/asv.conf.json @@ -3,9 +3,9 @@ // you know what you are doing. "version": 1, // The name of the project being benchmarked. - "project": "hipscat-import", + "project": "hats-import", // The project's homepage. 
- "project_url": "https://github.com/astronomy-commons/hipscat-import", + "project_url": "https://github.com/astronomy-commons/hats-import", // The URL or local path of the source code repository for the // project being benchmarked. "repo": "..", @@ -32,7 +32,7 @@ // variable. "environment_type": "virtualenv", // the base URL to show a commit for the project. - "show_commit_url": "https://github.com/astronomy-commons/hipscat-import/commit/", + "show_commit_url": "https://github.com/astronomy-commons/hats-import/commit/", // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. "pythons": [ diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index 061dc651..86f36e98 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -3,8 +3,8 @@ import numpy as np -from hipscat_import.catalog.resume_plan import ResumePlan -from hipscat_import.catalog.sparse_histogram import SparseHistogram +from hats_import.catalog.resume_plan import ResumePlan +from hats_import.catalog.sparse_histogram import SparseHistogram class BinningSuite: diff --git a/docs/catalogs/arguments.rst b/docs/catalogs/arguments.rst index dc3f1b6e..fa34ff76 100644 --- a/docs/catalogs/arguments.rst +++ b/docs/catalogs/arguments.rst @@ -9,7 +9,7 @@ A minimal arguments block will look something like: .. code-block:: python - from hipscat_import.catalog.arguments import ImportArguments + from hats_import.catalog.arguments import ImportArguments args = ImportArguments( sort_columns="ObjectID", @@ -25,8 +25,8 @@ A minimal arguments block will look something like: More details on each of these parameters is provided in sections below. For the curious, see the API documentation for -:py:class:`hipscat_import.catalog.arguments.ImportArguments`, and its superclass -:py:class:`hipscat_import.runtime_arguments.RuntimeArguments`. +:py:class:`hats_import.catalog.arguments.ImportArguments`, and its superclass +:py:class:`hats_import.runtime_arguments.RuntimeArguments`. Pipeline setup ------------------------------------------------------------------------------- @@ -52,7 +52,7 @@ to the pipeline, ignoring the above arguments. This would look like: .. code-block:: python from dask.distributed import Client - from hipscat_import.pipeline import pipeline_with_client + from hats_import.pipeline import pipeline_with_client args = ... # ImportArguments() with Client('scheduler:port') as client: @@ -63,7 +63,7 @@ potentially avoid some python threading issues with dask: .. code-block:: python - from hipscat_import.pipeline import pipeline + from hats_import.pipeline import pipeline def import_pipeline(): args = ... @@ -88,14 +88,14 @@ files are found, we will restore the pipeline's previous progress. If you want to start the pipeline from scratch you can simply set `resume=False`. Alternatively, go to the temp directory you've specified and remove any intermediate -files created by the previous runs of the ``hipscat-import`` pipeline. You should also +files created by the previous runs of the ``hats-import`` pipeline. You should also remove the output directory if it has any content. The resume argument performs these cleaning operations automatically for you. Reading input files ------------------------------------------------------------------------------- -Catalog import reads through a list of files and converts them into a hipscatted catalog. +Catalog import reads through a list of files and converts them into a hats-sharded catalog. Which files? 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -134,7 +134,7 @@ to parse a whitespace separated file. Otherwise, you can use a short string to specify an existing file reader type e.g. ``file_reader="csv"``. You can find the full API documentation for -:py:class:`hipscat_import.catalog.file_readers.InputReader` +:py:class:`hats_import.catalog.file_readers.InputReader` .. code-block:: python @@ -150,13 +150,6 @@ You can find the full API documentation for smaller_table = filter_nonsense(smaller_table) yield smaller_table.to_pandas() - def provenance_info(self) -> dict: - provenance_info = { - "input_reader_type": "StarrReader", - "chunksize": self.chunksize, - } - return provenance_info - ... args = ImportArguments( @@ -206,18 +199,18 @@ Which fields? Specify the ``ra_column`` and ``dec_column`` for the dataset. -There are two fields that we require in order to make a valid hipscatted +There are two fields that we require in order to make a valid hats-sharded catalog, the right ascension and declination. At this time, this is the only supported system for celestial coordinates. -If you're importing data that has previously been hipscatted, you may use -``use_hipscat_index = True``. This will use that previously compused hipscat spatial +If you're importing data that has previously been hats-sharded, you may use +``use_healpix_29 = True``. This will use that previously computed hats spatial index as the position, instead of ra/dec. Healpix order and thresholds ------------------------------------------------------------------------------- -When creating a new catalog through the hipscat-import process, we try to +When creating a new catalog through the hats-import process, we try to create partitions with approximately the same number of rows per partition. This isn't perfect, because the sky is uneven, but we still try to create smaller-area pixels in more dense areas, and larger-area pixels in less dense @@ -322,19 +315,19 @@ How? You may want to tweak parameters of the final catalog output, and we have helper arguments for a few of those. -``add_hipscat_index`` - ``bool`` - whether or not to add the hipscat spatial index -as a column in the resulting catalog. The ``_hipscat_index`` field is designed to make many +``add_healpix_29`` - ``bool`` - whether or not to add the hats spatial index +as a column in the resulting catalog. The ``_healpix_29`` field is designed to make many dask operations more performant, but if you do not intend to publish your dataset and do not intend to use dask, then you can suppress generation of this column to save a little space in your final disk usage. -The ``_hipscat_index`` uses a high healpix order and a uniqueness counter to create +The ``_healpix_29`` uses a high healpix order to create values that can order all points in the sky, according to a nested healpix scheme. ``sort_columns`` - ``str`` - column for survey identifier, or other sortable column. If sorting by multiple columns, they should be comma-separated. -If ``add_hipscat_index=True``, this sorting will be used to resolve the -index counter within the same higher-order pixel space. +If ``add_healpix_29=True``, ``_healpix_29`` will be the primary sort key, but the +provided sorting will be used for any rows within the same higher-order pixel space. ``use_schema_file`` - ``str`` - path to a parquet file with schema metadata. This will be used for column metadata when writing the files, if specified. 
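For illustration, a minimal arguments block that uses the renamed output-tweaking options above might look like the following sketch (paths, column names, and values are placeholders; it assumes only the ``ImportArguments`` fields that appear elsewhere in this diff, plus the standard ``output_path``/``output_artifact_name`` runtime arguments):

.. code-block:: python

    from hats_import.catalog.arguments import ImportArguments

    args = ImportArguments(
        output_artifact_name="my_catalog",
        input_path="/data/my_catalog_csvs",
        file_reader="csv",
        ra_column="ra",
        dec_column="dec",
        # keep the spatial index column; rows within a pixel are tie-broken by ObjectID
        add_healpix_29=True,
        sort_columns="ObjectID",
        # optional parquet file whose schema supplies column metadata
        use_schema_file="/data/my_catalog_schema.parquet",
        output_path="/output",
    )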
@@ -346,8 +339,6 @@ parquet files with the catalog data, and will only generate root-level metadata files representing the full statistics of the final catalog. This can be useful when probing the import process for effectiveness on processing a target dataset. -``epoch`` - ``str`` - astronomical epoch for the data. defaults to ``"J2000"`` - ``catalog_type`` - ``"object"`` or ``"source"``. Indicates the level of catalog data, using the LSST nomenclature: diff --git a/docs/catalogs/public/allwise.rst b/docs/catalogs/public/allwise.rst index edf9b4e2..0daa246d 100644 --- a/docs/catalogs/public/allwise.rst +++ b/docs/catalogs/public/allwise.rst @@ -32,9 +32,9 @@ Example import import pandas as pd - import hipscat_import.pipeline as runner - from hipscat_import.catalog.arguments import ImportArguments - from hipscat_import.catalog.file_readers import CsvReader + import hats_import.pipeline as runner + from hats_import.catalog.arguments import ImportArguments + from hats_import.catalog.file_readers import CsvReader # Load the column names and types from a side file. type_frame = pd.read_csv("allwise_types.csv") diff --git a/docs/catalogs/public/neowise.rst b/docs/catalogs/public/neowise.rst index 4a21fd8c..5f7657b3 100644 --- a/docs/catalogs/public/neowise.rst +++ b/docs/catalogs/public/neowise.rst @@ -32,9 +32,9 @@ Example import import pandas as pd - import hipscat_import.pipeline as runner - from hipscat_import.catalog.arguments import ImportArguments - from hipscat_import.catalog.file_readers import CsvReader + import hats_import.pipeline as runner + from hats_import.catalog.arguments import ImportArguments + from hats_import.catalog.file_readers import CsvReader # Load the column names and types from a side file. type_frame = pd.read_csv("neowise_types.csv") diff --git a/docs/catalogs/public/panstarrs.rst b/docs/catalogs/public/panstarrs.rst index c5141d8f..edcac8d6 100644 --- a/docs/catalogs/public/panstarrs.rst +++ b/docs/catalogs/public/panstarrs.rst @@ -30,9 +30,9 @@ Example import of objects (otmo) import pandas as pd - import hipscat_import.pipeline as runner - from hipscat_import.catalog.arguments import ImportArguments - from hipscat_import.catalog.file_readers import CsvReader + import hats_import.pipeline as runner + from hats_import.catalog.arguments import ImportArguments + from hats_import.catalog.file_readers import CsvReader # Load the column names and types from a side file. type_frame = pd.read_csv("ps1_otmo_types.csv") diff --git a/docs/catalogs/public/sdss.rst b/docs/catalogs/public/sdss.rst index 6ad74cc7..fb342644 100644 --- a/docs/catalogs/public/sdss.rst +++ b/docs/catalogs/public/sdss.rst @@ -64,8 +64,8 @@ Example import .. 
code-block:: python - from hipscat_import.catalog.arguments import ImportArguments - import hipscat_import.pipeline as runner + from hats_import.catalog.arguments import ImportArguments + import hats_import.pipeline as runner args = ImportArguments( output_artifact_name="sdss_dr16q", diff --git a/docs/catalogs/public/tic.rst b/docs/catalogs/public/tic.rst index 9376347e..1902cb19 100644 --- a/docs/catalogs/public/tic.rst +++ b/docs/catalogs/public/tic.rst @@ -30,9 +30,9 @@ Example import import pandas as pd - import hipscat_import.pipeline as runner - from hipscat_import.catalog.arguments import ImportArguments - from hipscat_import.catalog.file_readers import CsvReader + import hats_import.pipeline as runner + from hats_import.catalog.arguments import ImportArguments + from hats_import.catalog.file_readers import CsvReader type_frame = pd.read_csv("tic_types.csv") type_map = dict(zip(type_frame["name"], type_frame["type"])) diff --git a/docs/catalogs/public/zubercal.rst b/docs/catalogs/public/zubercal.rst index d2a9e1c9..97835dc7 100644 --- a/docs/catalogs/public/zubercal.rst +++ b/docs/catalogs/public/zubercal.rst @@ -32,9 +32,9 @@ Challenges with this data set .. code-block:: python - import hipscat_import.pipeline as runner - from hipscat_import.catalog.arguments import ImportArguments - from hipscat_import.catalog.file_readers import ParquetReader + import hats_import.pipeline as runner + from hats_import.catalog.arguments import ImportArguments + from hats_import.catalog.file_readers import ParquetReader import pyarrow.parquet as pq import pyarrow as pa import re diff --git a/docs/catalogs/temp_files.rst b/docs/catalogs/temp_files.rst index 451bc990..a752084d 100644 --- a/docs/catalogs/temp_files.rst +++ b/docs/catalogs/temp_files.rst @@ -1,7 +1,7 @@ Temporary files and disk usage =============================================================================== -This page aims to characterize intermediate files created by the hipscat-import +This page aims to characterize intermediate files created by the hats-import catalog creation process. Most users are going to be ok with setting the ``tmp_dir`` and not thinking much more about it. @@ -90,7 +90,7 @@ Some more explanation: What's happening when ------------------------------------------------------------------------------- -The hipscat-import catalog creation process generates a lot of temporary files. Some find this +The hats-import catalog creation process generates a lot of temporary files. Some find this surprising, so we try to provide a narrative of what's happening and why. Planning stage @@ -159,7 +159,7 @@ This is when storage shifts from intermediate files to the real output files. Finishing stage ............................................................................... -Here, we will write out a few additional final files (e.g. ``catalog_info.json``, ``_metadata``). +Here, we will write out a few additional final files (e.g. ``properties``, ``_metadata``). Additionally, we will clean up any straggling intermediate resume files. This includes all text log files, and the summed histogram file. After this stage, we should have zero intermediate files. @@ -196,10 +196,10 @@ final catalog can be very different from the on-disk size of the input files. In our internal testing, we converted a number of different kinds of catalogs, and share some of the results with you, to give some suggestion of the disk requirements -you may face when converting your own catalogs to hipscat format. 
+you may face when converting your own catalogs to hats format. ============= =============== =========== =============== ========================= -Catalog Input size (-h) Input size Hipscatted size Ratio +Catalog Input size (-h) Input size HATS size Ratio ============= =============== =========== =============== ========================= allwise 1.2T 1196115700 310184460 0.26 (a lot smaller) neowise 3.9T 4177447284 4263269112 1.02 (about the same) @@ -213,4 +213,4 @@ Notes: - allwise, neowise, and tic were all originally compressed CSV files. - sdss was originally a series of fits files - zubercal was originally 500k parquet files, and is reduced in the example to - around 70k hipscat parquet files. + around 70k hats parquet files. diff --git a/docs/conf.py b/docs/conf.py index a41343d2..eaf804a4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,10 +14,10 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = "hipscat-import" +project = "hats-import" copyright = "2023, LINCC Frameworks" author = "LINCC Frameworks" -release = version("hipscat-import") +release = version("hats-import") # for example take major/minor version = ".".join(release.split(".")[:2]) @@ -80,8 +80,8 @@ ## lets us suppress the copy button on select code blocks. copybutton_selector = "div:not(.no-copybutton) > div.highlight > pre" -# Cross-link hipscat documentation from the API reference: +# Cross-link hats documentation from the API reference: # https://docs.readthedocs.io/en/stable/guides/intersphinx.html intersphinx_mapping = { - "hipscat": ("http://hipscat.readthedocs.io/en/stable/", None), + "hats": ("http://hats.readthedocs.io/en/stable/", None), } diff --git a/docs/guide/contact.rst b/docs/guide/contact.rst index 48fa9647..5645b658 100644 --- a/docs/guide/contact.rst +++ b/docs/guide/contact.rst @@ -6,7 +6,7 @@ We at LINCC Frameworks pride ourselves on being a friendly bunch! If you're encountering issues, have some gnarly dataset, have ideas for making our products better, or pretty much anything else, reach out! -* Open an issue in our github repo for hipscat-import - * https://github.com/astronomy-commons/hipscat-import/issues/new +* Open an issue in our github repo for hats-import + * https://github.com/astronomy-commons/hats-import/issues/new * If you're on LSSTC slack, so are we! `#lincc-frameworks-qa `_ \ No newline at end of file diff --git a/docs/guide/contributing.rst b/docs/guide/contributing.rst index b2e1f545..7aa478a8 100644 --- a/docs/guide/contributing.rst +++ b/docs/guide/contributing.rst @@ -1,4 +1,4 @@ -Contributing to hipscat-import +Contributing to hats-import =============================================================================== Find (or make) a new GitHub issue diff --git a/docs/guide/dask_on_ray.rst b/docs/guide/dask_on_ray.rst index a80ade10..53d290d2 100644 --- a/docs/guide/dask_on_ray.rst +++ b/docs/guide/dask_on_ray.rst @@ -8,7 +8,7 @@ See more on Ray's site: https://docs.ray.io/en/latest/ray-more-libs/dask-on-ray.html -How to use in hipscat-import pipelines +How to use in hats-import pipelines ------------------------------------------------------------------------------- Install ray @@ -27,7 +27,7 @@ You should also disable ray when you're done, just to clean things up. 
from dask.distributed import Client from ray.util.dask import disable_dask_on_ray, enable_dask_on_ray - from hipscat_import.pipeline import pipeline_with_client + from hats_import.pipeline import pipeline_with_client with ray.init( num_cpus=args.dask_n_workers, diff --git a/docs/guide/index_table.rst b/docs/guide/index_table.rst index eb816bf2..9bff8ea1 100644 --- a/docs/guide/index_table.rst +++ b/docs/guide/index_table.rst @@ -2,7 +2,7 @@ Index Table =============================================================================== This page discusses topics around setting up a pipeline to generate a secondary -index lookup for a field on an existing hipscat catalog on disk. +index lookup for a field on an existing hats catalog on disk. This is useful if you would like to have quick access to rows of your table using a survey-provided unique identifier that is NOT spatially correlated. To find @@ -15,7 +15,7 @@ and where to put the output files. A minimal arguments block will look something .. code-block:: python - from hipscat_import.index.arguments import IndexArguments + from hats_import.index.arguments import IndexArguments args = IndexArguments( input_catalog_path="./my_data/my_catalog", @@ -27,8 +27,8 @@ and where to put the output files. A minimal arguments block will look something More details on each of these parameters is provided in sections below. For the curious, see the API documentation for -:py:class:`hipscat_import.index.arguments.IndexArguments`, -and its superclass :py:class:`hipscat_import.runtime_arguments.RuntimeArguments`. +:py:class:`hats_import.index.arguments.IndexArguments`, +and its superclass :py:class:`hats_import.runtime_arguments.RuntimeArguments`. Dask setup ------------------------------------------------------------------------------- @@ -51,7 +51,7 @@ to the pipeline, ignoring the above arguments. This would look like: .. code-block:: python from dask.distributed import Client - from hipscat_import.pipeline import pipeline_with_client + from hats_import.pipeline import pipeline_with_client args = IndexArguments(...) with Client('scheduler:port') as client: @@ -62,7 +62,7 @@ potentially avoid some python threading issues with dask: .. code-block:: python - from hipscat_import.pipeline import pipeline + from hats_import.pipeline import pipeline def index_pipeline(): args = IndexArguments(...) @@ -75,7 +75,7 @@ Input Catalog ------------------------------------------------------------------------------- For this pipeline, you will need to have already transformed your catalog into -hipscat parquet format. Provide the path to the catalog data with the argument +hats parquet format. Provide the path to the catalog data with the argument ``input_catalog_path``. ``indexing_column`` is required, and is the column that you would like to create @@ -120,7 +120,7 @@ string sorting will be smart enough to collate the various strings appropriately .. code-block:: python - divisions = [f"Gaia DR3 {i}" for i in range(10000, 99999, 12)] + divisions = [f"Gaia DR3 {i}" for i in range(10_000, 99_999, 12)] divisions.append("Gaia DR3 999999988604363776") Getting hints from ``_metadata`` @@ -149,8 +149,8 @@ list along to your ``ImportArguments``! import numpy as np import os - from hipscat.io.parquet_metadata import write_parquet_metadata - from hipscat.io import file_io + from hats.io.parquet_metadata import write_parquet_metadata + from hats.io import file_io ## Specify the catalog and column you're making your index over. 
input_catalog_path="/data/input_catalog" @@ -249,10 +249,8 @@ arguments for a few of those. ``compute_partition_size`` - ``int`` - partition size used when computing the leaf parquet files. -``include_hipscat_index`` - ``bool`` - whether or not to include the 64-bit -hipscat spatial index in the index table. Defaults to ``True``. It can be -useful to keep this value if the ``_hipscat_index`` is your only unique -identifier, or you intend to re-partition your data. +``include_healpix_29`` - ``bool`` - whether or not to include the 64-bit +hats spatial index in the index table. Defaults to ``True``. ``include_order_pixel`` - ``bool`` - whether to include partitioning columns, ``Norder``, ``Dir``, and ``Npix``. You probably want to keep these! @@ -261,7 +259,7 @@ when trying to use the index table. ``drop_duplicates`` - ``bool`` - drop duplicate occurrences of all fields that are included in the index table. This is enabled by default, but can be -**very** slow. This has an interaction with the above ``include_hipscat_index`` +**very** slow. This has an interaction with the above ``include_healpix_29`` and ``include_order_pixel`` options above. We desribe some common patterns below: - I want to create an index over the target ID in my catalog. There are no @@ -270,8 +268,7 @@ and ``include_order_pixel`` options above. We desribe some common patterns below .. code-block:: python indexing_column="target_id", - # target_id is unique, and I don't need to keep extra data - include_hipscat_index=False, + include_healpix_29=False, # I want to know where my data is in the sky. include_order_pixel=True, # target_id is unique, and I don't need to do extra work to de-duplicate @@ -287,7 +284,7 @@ and ``include_order_pixel`` options above. We desribe some common patterns below indexing_column="target_id", # target_id is NOT unique drop_duplicates=True, - # target_id is NOT unique, but including the _hipscat_index will bloat results - include_hipscat_index=False, + # including the _healpix_29 will bloat results + include_healpix_29=False, # I want to know where my data is in the sky. include_order_pixel=True, diff --git a/docs/guide/margin_cache.rst b/docs/guide/margin_cache.rst index d481b519..613df6cb 100644 --- a/docs/guide/margin_cache.rst +++ b/docs/guide/margin_cache.rst @@ -6,14 +6,14 @@ For more discussion of the whys and hows of margin caches, please see for more information. This page discusses topics around setting up a pipeline to generate a margin -cache from an existing hipscat catalog on disk. +cache from an existing hats catalog on disk. At a minimum, you need arguments that include where to find the input files, and where to put the output files. A minimal arguments block will look something like: .. code-block:: python - from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments + from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments args = MarginCacheArguments( input_catalog_path="./my_data/my_catalog", @@ -26,8 +26,8 @@ and where to put the output files. A minimal arguments block will look something More details on each of these parameters is provided in sections below. For the curious, see the API documentation for -:py:class:`hipscat_import.margin_cache.margin_cache_arguments.MarginCacheArguments`, -and its superclass :py:class:`hipscat_import.runtime_arguments.RuntimeArguments`. 
+:py:class:`hats_import.margin_cache.margin_cache_arguments.MarginCacheArguments`, +and its superclass :py:class:`hats_import.runtime_arguments.RuntimeArguments`. Dask setup ------------------------------------------------------------------------------- @@ -50,7 +50,7 @@ to the pipeline, ignoring the above arguments. This would look like: .. code-block:: python from dask.distributed import Client - from hipscat_import.pipeline import pipeline_with_client + from hats_import.pipeline import pipeline_with_client args = MarginCacheArguments(...) with Client('scheduler:port') as client: @@ -61,7 +61,7 @@ potentially avoid some python threading issues with dask: .. code-block:: python - from hipscat_import.pipeline import pipeline + from hats_import.pipeline import pipeline def margin_pipeline(): args = MarginCacheArguments(...) @@ -74,10 +74,10 @@ Input Catalog ------------------------------------------------------------------------------- For this pipeline, you will need to have already transformed your catalog into -hipscat parquet format. Provide the path to the catalog data with the argument +hats parquet format. Provide the path to the catalog data with the argument ``input_catalog_path``. -The input hipscat catalog will provide its own right ascension and declination +The input hats catalog will provide its own right ascension and declination that will be used when computing margin populations. Margin calculation parameters diff --git a/docs/index.rst b/docs/index.rst index 57852de1..01d6bb7f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,25 +1,25 @@ -HiPSCat Import +HATS Import ======================================================================================== -Utility for ingesting large survey data into HiPSCat structure. +Utility for ingesting large survey data into HATS structure. Installation ------------------------------------------------------------------------------- We recommend installing in a virtual environment, like venv or conda. You may -need to install or upgrade versions of dependencies to work with hipscat-import. +need to install or upgrade versions of dependencies to work with hats-import. .. code-block:: console - pip install hipscat-import + pip install hats-import .. tip:: Installing on Mac - ``healpy`` is a very necessary dependency for hipscat libraries at this time, but + ``healpy`` is a very necessary dependency for hats libraries at this time, but native prebuilt binaries for healpy on Apple Silicon Macs `do not yet exist `_, - so it's recommended to install via conda before proceeding to hipscat-import. + so it's recommended to install via conda before proceeding to hats-import. .. code-block:: console @@ -29,7 +29,7 @@ need to install or upgrade versions of dependencies to work with hipscat-import. Setting up a pipeline ------------------------------------------------------------------------------- -For each type of dataset the hipscat-import tool can generate, there is an argument +For each type of dataset the hats-import tool can generate, there is an argument container class that you will need to instantiate and populate with relevant arguments. See dataset-specific notes on arguments: @@ -45,7 +45,7 @@ threading issues with dask: .. code-block:: python from dask.distributed import Client - from hipscat_import.pipeline import pipeline_with_client + from hats_import.pipeline import pipeline_with_client def main(): args = ... 
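The ``docs/index.rst`` hunk above ends mid-example; the pattern it introduces, spelled out fully, looks roughly like the sketch below (worker counts are placeholders, and the argument object stands in for whichever ``*Arguments`` class you are using):

.. code-block:: python

    from dask.distributed import Client
    from hats_import.pipeline import pipeline_with_client

    def main():
        args = ...  # e.g. a fully-populated ImportArguments(...)
        with Client(n_workers=4, threads_per_worker=1) as client:
            pipeline_with_client(args, client)

    if __name__ == "__main__":
        main()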
diff --git a/docs/notebooks/estimate_pixel_threshold.ipynb b/docs/notebooks/estimate_pixel_threshold.ipynb index 670192bc..0908268e 100755 --- a/docs/notebooks/estimate_pixel_threshold.ipynb +++ b/docs/notebooks/estimate_pixel_threshold.ipynb @@ -11,7 +11,7 @@ "\n", "**Background**\n", "\n", - "When creating a new catalog through the hipscat-import process, we try to create partitions with approximately the same number of rows per partition. This isn't perfect, because the sky is uneven, but we still try to create smaller-area pixels in more dense areas, and larger-area pixels in less dense areas. We use the argument `pixel_threshold` and will split a partition into smaller healpix pixels until the number of rows is smaller than `pixel_threshold`.\n", + "When creating a new catalog through the hats-import process, we try to create partitions with approximately the same number of rows per partition. This isn't perfect, because the sky is uneven, but we still try to create smaller-area pixels in more dense areas, and larger-area pixels in less dense areas. We use the argument `pixel_threshold` and will split a partition into smaller healpix pixels until the number of rows is smaller than `pixel_threshold`.\n", "\n", "We do this to increase parallelization of reads and downstream analysis: if the files are around the same size, and operations on each partition take around the same amount of time, we're not as likely to be waiting on a single process to complete for the whole pipeline to complete.\n", "\n", @@ -19,7 +19,7 @@ "\n", "**Objective**\n", "\n", - "In this notebook, we'll go over *one* strategy for estimating the `pixel_threshold` argument you can use when importing a new catalog into hipscat format.\n", + "In this notebook, we'll go over *one* strategy for estimating the `pixel_threshold` argument you can use when importing a new catalog into hats format.\n", "\n", "This is not guaranteed to give you optimal results, but it could give you some hints toward *better* results." ] @@ -60,10 +60,10 @@ "metadata": {}, "outputs": [], "source": [ - "from hipscat_import.catalog.file_readers import CsvReader\n", + "from hats_import.catalog.file_readers import CsvReader\n", "\n", "### Change this path!!!\n", - "input_file = \"../../tests/hipscat_import/data/small_sky/catalog.csv\"\n", + "input_file = \"../../tests/data/small_sky/catalog.csv\"\n", "\n", "file_reader = CsvReader(chunksize=5_000)\n", "\n", @@ -77,7 +77,7 @@ "source": [ "## Inspect parquet file and metadata\n", "\n", - "Now that we have parsed our survey data into parquet, we can check what the data will look like when it's imported into hipscat format.\n", + "Now that we have parsed our survey data into parquet, we can check what the data will look like when it's imported into hats format.\n", "\n", "If you're just here to get a naive estimate for your pixel threshold, we'll do that first, then take a look at some other parquet characteristics later for the curious." ] @@ -161,7 +161,7 @@ "\n", "Below, we inspect the row and column group metadata to show the compressed size of the fields on disk. The last column, `percent`, show the percent of total size taken up by the column.\n", "\n", - "You *can* use this to inform which columns you keep when importing a catalog into hipscat format. e.g. if some columns are less useful for your science, and take up a lot of space, maybe leave them out!" + "You *can* use this to inform which columns you keep when importing a catalog into hats format. e.g. 
if some columns are less useful for your science, and take up a lot of space, maybe leave them out!" ] }, { @@ -192,7 +192,7 @@ ], "metadata": { "kernelspec": { - "display_name": "hipscatenv", + "display_name": "hatsenv", "language": "python", "name": "python3" }, diff --git a/docs/notebooks/unequal_schema.ipynb b/docs/notebooks/unequal_schema.ipynb index 3fc39e0a..664b448b 100644 --- a/docs/notebooks/unequal_schema.ipynb +++ b/docs/notebooks/unequal_schema.ipynb @@ -67,11 +67,11 @@ "import os\n", "from dask.distributed import Client\n", "\n", - "from hipscat_import.pipeline import pipeline_with_client\n", - "from hipscat_import.catalog.arguments import ImportArguments\n", - "from hipscat_import.catalog.file_readers import get_file_reader\n", + "from hats_import.pipeline import pipeline_with_client\n", + "from hats_import.catalog.arguments import ImportArguments\n", + "from hats_import.catalog.file_readers import get_file_reader\n", "\n", - "mixed_schema_csv_dir = \"../../tests/hipscat_import/data/mixed_schema\"\n", + "mixed_schema_csv_dir = \"../../tests/data/mixed_schema\"\n", "tmp_path = tempfile.TemporaryDirectory()\n", "\n", "args = ImportArguments(\n", @@ -110,7 +110,7 @@ "source": [ "import pyarrow.parquet as pq\n", "\n", - "mixed_schema_csv_parquet = \"../../tests/hipscat_import/data/mixed_schema/schema.parquet\"\n", + "mixed_schema_csv_parquet = \"../../tests/data/mixed_schema/schema.parquet\"\n", "\n", "parquet_file = pq.ParquetFile(mixed_schema_csv_parquet)\n", "print(parquet_file.schema)" @@ -294,7 +294,7 @@ ], "metadata": { "kernelspec": { - "display_name": "hipscatenv", + "display_name": "hatsenv", "language": "python", "name": "python3" }, diff --git a/docs/requirements.txt b/docs/requirements.txt index 11f126d9..80207354 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -9,4 +9,4 @@ sphinx sphinx-autoapi sphinx-copybutton sphinx-book-theme -git+https://github.com/astronomy-commons/hipscat.git@main \ No newline at end of file +git+https://github.com/astronomy-commons/hats.git@main \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 9be58956..72abf996 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "hipscat-import" +name = "hats-import" license = {file = "LICENSE"} readme = "README.md" authors = [ @@ -17,7 +17,7 @@ dynamic = ["version"] dependencies = [ "dask[complete]>=2024.3.0", # Includes dask expressions. "deprecated", - "hipscat >=0.3.8", + "hats >=0.4", "ipykernel", # Support for Jupyter notebooks "numpy", "pandas", @@ -40,7 +40,6 @@ dev = [ "pytest", "pytest-cov", "pytest-timeout", - "mypy", # Used for static type checking of files "ray", # Used for dask-on-ray testing. 
"types-PyYAML", # type stubs for pyyaml ] @@ -53,10 +52,10 @@ requires = [ build-backend = "setuptools.build_meta" [tool.setuptools_scm] -write_to = "src/hipscat_import/_version.py" +write_to = "src/hats_import/_version.py" [tool.setuptools.package-data] -hipscat_import = ["py.typed"] +hats_import = ["py.typed"] [tool.pytest.ini_options] timeout = 1 @@ -69,8 +68,8 @@ testpaths = [ [tool.coverage.report] omit = [ - "src/hipscat_import/_version.py", # auto-generated - "src/hipscat_import/pipeline.py", # too annoying to test + "src/hats_import/_version.py", # auto-generated + "src/hats_import/pipeline.py", # too annoying to test ] [tool.black] @@ -129,6 +128,6 @@ ignore = [ [tool.coverage.run] omit = [ - "src/hipscat_import/_version.py", # auto-generated - "src/hipscat_import/pipeline.py", # too annoying to test + "src/hats_import/_version.py", # auto-generated + "src/hats_import/pipeline.py", # too annoying to test ] diff --git a/requirements.txt b/requirements.txt index 124b2043..d405b240 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -git+https://github.com/astronomy-commons/hipscat.git@main \ No newline at end of file +git+https://github.com/astronomy-commons/hats.git@main \ No newline at end of file diff --git a/src/.pylintrc b/src/.pylintrc index 25eff61d..4bf8b358 100644 --- a/src/.pylintrc +++ b/src/.pylintrc @@ -280,6 +280,8 @@ ignored-parents= # Maximum number of arguments for function / method. max-args=10 +max-positional-arguments=15 + # Maximum number of attributes for a class (see R0902). max-attributes=20 diff --git a/src/hipscat_import/__init__.py b/src/hats_import/__init__.py similarity index 64% rename from src/hipscat_import/__init__.py rename to src/hats_import/__init__.py index ccdbd851..e8990428 100644 --- a/src/hipscat_import/__init__.py +++ b/src/hats_import/__init__.py @@ -1,4 +1,4 @@ -"""All modules for hipscat-import package""" +"""All modules for hats-import package""" from ._version import __version__ from .runtime_arguments import RuntimeArguments diff --git a/src/hipscat_import/catalog/__init__.py b/src/hats_import/catalog/__init__.py similarity index 100% rename from src/hipscat_import/catalog/__init__.py rename to src/hats_import/catalog/__init__.py diff --git a/src/hipscat_import/catalog/arguments.py b/src/hats_import/catalog/arguments.py similarity index 68% rename from src/hipscat_import/catalog/arguments.py rename to src/hats_import/catalog/arguments.py index 89d0c144..51af2724 100644 --- a/src/hipscat_import/catalog/arguments.py +++ b/src/hats_import/catalog/arguments.py @@ -6,12 +6,12 @@ from pathlib import Path from typing import List -from hipscat.catalog.catalog import CatalogInfo -from hipscat.pixel_math import hipscat_id +from hats.catalog import TableProperties +from hats.pixel_math import spatial_index from upath import UPath -from hipscat_import.catalog.file_readers import InputReader, get_file_reader -from hipscat_import.runtime_arguments import RuntimeArguments, find_input_paths +from hats_import.catalog.file_readers import InputReader, get_file_reader +from hats_import.runtime_arguments import RuntimeArguments, find_input_paths # pylint: disable=too-many-locals,too-many-arguments,too-many-instance-attributes,too-many-branches,too-few-public-methods @@ -20,9 +20,6 @@ class ImportArguments(RuntimeArguments): """Container class for holding partitioning arguments""" - epoch: str = "J2000" - """astronomical epoch for the data. 
defaults to "J2000" """ - catalog_type: str = "object" """level of catalog data, object (things in the sky) or source (detections)""" input_path: str | Path | UPath | None = None @@ -36,14 +33,14 @@ class ImportArguments(RuntimeArguments): """column for right ascension""" dec_column: str = "dec" """column for declination""" - use_hipscat_index: bool = False - """use an existing hipscat spatial index as the position, instead of ra/dec""" + use_healpix_29: bool = False + """use an existing healpix-based hats spatial index as the position, instead of ra/dec""" sort_columns: str | None = None """column for survey identifier, or other sortable column. if sorting by multiple columns, - they should be comma-separated. if `add_hipscat_index=True`, this sorting will be used to - resolve the counter within the same higher-order pixel space""" - add_hipscat_index: bool = True - """add the hipscat spatial index field alongside the data""" + they should be comma-separated. If `add_healpix_29=True`, `_healpix_29` will be the primary sort key, + but the provided sorting will be used for any rows within the same higher-order pixel space.""" + add_healpix_29: bool = True + """add the healpix-based hats spatial index field alongside the data""" use_schema_file: str | Path | UPath | None = None """path to a parquet file with schema metadata. this will be used for column metadata when writing the files, if specified""" @@ -74,13 +71,6 @@ class ImportArguments(RuntimeArguments): """healpix order to use when mapping. will be ``highest_healpix_order`` unless a positive value is provided for ``constant_healpix_order``""" - delete_intermediate_parquet_files: bool = True - """should we delete the smaller intermediate parquet files generated in the - splitting stage, once the relevant reducing stage is complete?""" - delete_resume_log_files: bool = True - """should we delete task-level done files once each stage is complete? - if False, we will keep all sub-histograms from the mapping stage, and all - done marker files at the end of the pipeline.""" run_stages: List[str] = field(default_factory=list) """list of parallel stages to run. options are ['mapping', 'splitting', 'reducing', 'finishing']. ['planning', 'binning'] stages are not optional. 
@@ -123,53 +113,34 @@ def _check_arguments(self): if isinstance(self.file_reader, str): self.file_reader = get_file_reader(self.file_reader) - if self.use_hipscat_index: - self.add_hipscat_index = False + if self.use_healpix_29: + self.add_healpix_29 = False if self.sort_columns: - raise ValueError("When using _hipscat_index for position, no sort columns should be added") + raise ValueError("When using _healpix_29 for position, no sort columns should be added") # Basic checks complete - make more checks and create directories where necessary self.input_paths = find_input_paths(self.input_path, "**/*.*", self.input_file_list) - def to_catalog_info(self, total_rows) -> CatalogInfo: + def to_table_properties( + self, total_rows: int, highest_order: int, moc_sky_fraction: float + ) -> TableProperties: """Catalog-type-specific dataset info.""" info = { "catalog_name": self.output_artifact_name, "catalog_type": self.catalog_type, "total_rows": total_rows, - "epoch": self.epoch, - "ra_column": self.ra_column, - "dec_column": self.dec_column, - } - return CatalogInfo(**info) - - def additional_runtime_provenance_info(self) -> dict: - file_reader_info = {"type": self.file_reader} - if isinstance(self.file_reader, InputReader): - file_reader_info = self.file_reader.provenance_info() - return { - "catalog_name": self.output_artifact_name, - "epoch": self.epoch, - "catalog_type": self.catalog_type, - "input_path": self.input_path, - "input_paths": self.input_paths, - "input_file_list": self.input_file_list, "ra_column": self.ra_column, "dec_column": self.dec_column, - "use_hipscat_index": self.use_hipscat_index, - "sort_columns": self.sort_columns, - "constant_healpix_order": self.constant_healpix_order, - "lowest_healpix_order": self.lowest_healpix_order, - "highest_healpix_order": self.highest_healpix_order, - "pixel_threshold": self.pixel_threshold, - "mapping_healpix_order": self.mapping_healpix_order, - "debug_stats_only": self.debug_stats_only, - "file_reader_info": file_reader_info, - } + "hats_cols_sort": self.sort_columns, + "hats_max_rows": self.pixel_threshold, + "hats_order": highest_order, + "moc_sky_fraction": f"{moc_sky_fraction:0.5f}", + } | self.extra_property_dict() + return TableProperties(**info) def check_healpix_order_range( - order, field_name, lower_bound=0, upper_bound=hipscat_id.HIPSCAT_ID_HEALPIX_ORDER + order, field_name, lower_bound=0, upper_bound=spatial_index.SPATIAL_INDEX_ORDER ): """Helper method to check if the ``order`` is within the range determined by the ``lower_bound`` and ``upper_bound``, inclusive. 
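A small usage sketch of the ``check_healpix_order_range`` helper above, using the renamed ``spatial_index`` bound (the raising logic appears in the next hunk; the field name is a placeholder):

.. code-block:: python

    from hats.pixel_math import spatial_index
    from hats_import.catalog.arguments import check_healpix_order_range

    # within [0, SPATIAL_INDEX_ORDER], inclusive: returns without error
    check_healpix_order_range(7, "highest_healpix_order")

    # above the spatial index order: rejected
    try:
        check_healpix_order_range(spatial_index.SPATIAL_INDEX_ORDER + 1, "highest_healpix_order")
    except ValueError as error:
        print(error)  # reports the field name and the allowed range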
@@ -185,7 +156,7 @@ def check_healpix_order_range( """ if lower_bound < 0: raise ValueError("healpix orders must be positive") - if upper_bound > hipscat_id.HIPSCAT_ID_HEALPIX_ORDER: - raise ValueError(f"healpix order should be <= {hipscat_id.HIPSCAT_ID_HEALPIX_ORDER}") + if upper_bound > spatial_index.SPATIAL_INDEX_ORDER: + raise ValueError(f"healpix order should be <= {spatial_index.SPATIAL_INDEX_ORDER}") if not lower_bound <= order <= upper_bound: raise ValueError(f"{field_name} should be between {lower_bound} and {upper_bound}") diff --git a/src/hipscat_import/catalog/file_readers.py b/src/hats_import/catalog/file_readers.py similarity index 95% rename from src/hipscat_import/catalog/file_readers.py rename to src/hats_import/catalog/file_readers.py index 300717d9..40c05be5 100644 --- a/src/hipscat_import/catalog/file_readers.py +++ b/src/hats_import/catalog/file_readers.py @@ -8,7 +8,7 @@ import pyarrow.parquet as pq from astropy.io import ascii as ascii_reader from astropy.table import Table -from hipscat.io import file_io +from hats.io import file_io from upath import UPath # pylint: disable=too-few-public-methods,too-many-arguments @@ -98,21 +98,6 @@ def read(self, input_file, read_columns=None): DataFrame containing chunk of file info. """ - def provenance_info(self) -> dict: - """Create dictionary of parameters for provenance tracking. - - If any `storage_options` have been provided as kwargs, we will replace the - value with ``REDACTED`` for the purpose of writing to provenance info, as it - may contain user names or API keys. - - Returns: - dictionary with all argument_name -> argument_value as key -> value pairs. - """ - all_args = vars(self) - if "kwargs" in all_args and "storage_options" in all_args["kwargs"]: - all_args["kwargs"]["storage_options"] = "REDACTED" - return {"input_reader_type": type(self).__name__, **vars(self)} - def regular_file_exists(self, input_file, **_kwargs): """Check that the `input_file` points to a single regular file diff --git a/src/hipscat_import/catalog/map_reduce.py b/src/hats_import/catalog/map_reduce.py similarity index 82% rename from src/hipscat_import/catalog/map_reduce.py rename to src/hats_import/catalog/map_reduce.py index 799c3339..82027894 100644 --- a/src/hipscat_import/catalog/map_reduce.py +++ b/src/hats_import/catalog/map_reduce.py @@ -1,20 +1,20 @@ -"""Import a set of non-hipscat files using dask for parallelization""" +"""Import a set of non-hats files using dask for parallelization""" import pickle -import hipscat.pixel_math.healpix_shim as hp +import hats.pixel_math.healpix_shim as hp import numpy as np import pyarrow as pa import pyarrow.parquet as pq -from hipscat import pixel_math -from hipscat.io import file_io, paths -from hipscat.pixel_math.healpix_pixel import HealpixPixel -from hipscat.pixel_math.hipscat_id import HIPSCAT_ID_COLUMN, hipscat_id_to_healpix +from hats import pixel_math +from hats.io import file_io, paths +from hats.pixel_math.healpix_pixel import HealpixPixel +from hats.pixel_math.spatial_index import SPATIAL_INDEX_COLUMN, spatial_index_to_healpix from upath import UPath -from hipscat_import.catalog.resume_plan import ResumePlan -from hipscat_import.catalog.sparse_histogram import SparseHistogram -from hipscat_import.pipeline_resume_plan import get_pixel_cache_directory, print_task_failure +from hats_import.catalog.resume_plan import ResumePlan +from hats_import.catalog.sparse_histogram import SparseHistogram +from hats_import.pipeline_resume_plan import get_pixel_cache_directory, print_task_failure # 
pylint: disable=too-many-locals,too-many-arguments @@ -39,7 +39,7 @@ def _iterate_input_file( highest_order, ra_column, dec_column, - use_hipscat_index=False, + use_healpix_29=False, read_columns=None, ): """Helper function to handle input file reading and healpix pixel calculation""" @@ -49,11 +49,13 @@ def _iterate_input_file( raise NotImplementedError("No file reader implemented") for chunk_number, data in enumerate(file_reader.read(input_file, read_columns=read_columns)): - if use_hipscat_index: - if data.index.name == HIPSCAT_ID_COLUMN: - mapped_pixels = hipscat_id_to_healpix(data.index, target_order=highest_order) + if use_healpix_29: + if data.index.name == SPATIAL_INDEX_COLUMN: + mapped_pixels = spatial_index_to_healpix(data.index, target_order=highest_order) else: - mapped_pixels = hipscat_id_to_healpix(data[HIPSCAT_ID_COLUMN], target_order=highest_order) + mapped_pixels = spatial_index_to_healpix( + data[SPATIAL_INDEX_COLUMN], target_order=highest_order + ) else: # Set up the pixel data mapped_pixels = hp.ang2pix( @@ -74,13 +76,13 @@ def map_to_pixels( highest_order, ra_column, dec_column, - use_hipscat_index=False, + use_healpix_29=False, ): """Map a file of input objects to their healpix pixels. Args: input_file (UPath): file to read for catalog data. - file_reader (hipscat_import.catalog.file_readers.InputReader): instance of input + file_reader (hats_import.catalog.file_readers.InputReader): instance of input reader that specifies arguments necessary for reading from the input file. resume_path (UPath): where to write resume partial results. mapping_key (str): unique counter for this input file, used @@ -99,8 +101,8 @@ def map_to_pixels( try: histo = SparseHistogram.make_empty(highest_order) - if use_hipscat_index: - read_columns = [HIPSCAT_ID_COLUMN] + if use_healpix_29: + read_columns = [SPATIAL_INDEX_COLUMN] else: read_columns = [ra_column, dec_column] @@ -110,7 +112,7 @@ def map_to_pixels( highest_order, ra_column, dec_column, - use_hipscat_index, + use_healpix_29, read_columns, ): mapped_pixel, count_at_pixel = np.unique(mapped_pixels, return_counts=True) @@ -136,13 +138,13 @@ def split_pixels( cache_shard_path: UPath, resume_path: UPath, alignment_file=None, - use_hipscat_index=False, + use_healpix_29=False, ): """Map a file of input objects to their healpix pixels and split into shards. Args: input_file (UPath): file to read for catalog data. - file_reader (hipscat_import.catalog.file_readers.InputReader): instance + file_reader (hats_import.catalog.file_readers.InputReader): instance of input reader that specifies arguments necessary for reading from the input file. 
splitting_key (str): unique counter for this input file, used when creating intermediate files @@ -160,16 +162,13 @@ def split_pixels( with open(alignment_file, "rb") as pickle_file: alignment = pickle.load(pickle_file) for chunk_number, data, mapped_pixels in _iterate_input_file( - input_file, pickled_reader_file, highest_order, ra_column, dec_column, use_hipscat_index + input_file, pickled_reader_file, highest_order, ra_column, dec_column, use_healpix_29 ): aligned_pixels = alignment[mapped_pixels] unique_pixels, unique_inverse = np.unique(aligned_pixels, return_inverse=True) for unique_index, [order, pixel, _] in enumerate(unique_pixels): - mapped_indexes = np.where(unique_inverse == unique_index) - data_indexes = data.index[mapped_indexes[0].tolist()] - - filtered_data = data.filter(items=data_indexes, axis=0) + filtered_data = data.iloc[unique_inverse == unique_index] pixel_dir = get_pixel_cache_directory(cache_shard_path, HealpixPixel(order, pixel)) file_io.make_directory(pixel_dir, exist_ok=True) @@ -180,7 +179,7 @@ def split_pixels( filtered_data.to_parquet(output_file.path, index=True, filesystem=output_file.fs) else: filtered_data.to_parquet(output_file.path, index=False, filesystem=output_file.fs) - del filtered_data, data_indexes + del filtered_data ResumePlan.splitting_key_done(tmp_path=resume_path, splitting_key=splitting_key) except Exception as exception: # pylint: disable=broad-exception-caught @@ -199,8 +198,8 @@ def reduce_pixel_shards( ra_column, dec_column, sort_columns: str = "", - use_hipscat_index=False, - add_hipscat_index=True, + use_healpix_29=False, + add_healpix_29=True, delete_input_files=True, use_schema_file="", ): @@ -212,15 +211,15 @@ def reduce_pixel_shards( - ``Norder`` - the healpix order for the pixel - ``Dir`` - the directory part, corresponding to the pixel - ``Npix`` - the healpix pixel - - ``_hipscat_index`` - optional - a spatially-correlated + - ``_healpix_29`` - optional - a spatially-correlated 64-bit index field. - Notes on ``_hipscat_index``: + Notes on ``_healpix_29``: - if we generate the field, we will promote any previous *named* pandas index field(s) to a column with that name. - - see ``hipscat.pixel_math.hipscat_id`` + - see ``hats.pixel_math.spatial_index`` for more in-depth discussion of this field. Args: @@ -235,7 +234,7 @@ def reduce_pixel_shards( for the catalog's final pixel output_path (UPath): where to write the final catalog pixel data sort_columns (str): column for survey identifier, or other sortable column - add_hipscat_index (bool): should we add a _hipscat_index column to + add_healpix_29 (bool): should we add a _healpix_29 column to the resulting parquet file? delete_input_files (bool): should we delete the intermediate files used as input for this method. @@ -281,22 +280,22 @@ def reduce_pixel_shards( dataframe = merged_table.to_pandas() if sort_columns: dataframe = dataframe.sort_values(sort_columns.split(",")) - if add_hipscat_index: + if add_healpix_29: ## If we had a meaningful index before, preserve it as a column. 
if _has_named_index(dataframe): dataframe = dataframe.reset_index() - dataframe[HIPSCAT_ID_COLUMN] = pixel_math.compute_hipscat_id( + dataframe[SPATIAL_INDEX_COLUMN] = pixel_math.compute_spatial_index( dataframe[ra_column].values, dataframe[dec_column].values, ) - dataframe = dataframe.set_index(HIPSCAT_ID_COLUMN).sort_index() + dataframe = dataframe.set_index(SPATIAL_INDEX_COLUMN).sort_index() - # Adjust the schema to make sure that the _hipscat_index will + # Adjust the schema to make sure that the _healpix_29 will # be saved as a uint64 - elif use_hipscat_index: - if dataframe.index.name != HIPSCAT_ID_COLUMN: - dataframe = dataframe.set_index(HIPSCAT_ID_COLUMN) + elif use_healpix_29: + if dataframe.index.name != SPATIAL_INDEX_COLUMN: + dataframe = dataframe.set_index(SPATIAL_INDEX_COLUMN) dataframe = dataframe.sort_index() dataframe["Norder"] = np.full(rows_written, fill_value=healpix_pixel.order, dtype=np.uint8) @@ -304,7 +303,7 @@ def reduce_pixel_shards( dataframe["Npix"] = np.full(rows_written, fill_value=healpix_pixel.pixel, dtype=np.uint64) if schema: - schema = _modify_arrow_schema(schema, add_hipscat_index) + schema = _modify_arrow_schema(schema, add_healpix_29) dataframe.to_parquet(destination_file.path, schema=schema, filesystem=destination_file.fs) else: dataframe.to_parquet(destination_file.path, filesystem=destination_file.fs) @@ -325,12 +324,12 @@ def reduce_pixel_shards( raise exception -def _modify_arrow_schema(schema, add_hipscat_index): - if add_hipscat_index: +def _modify_arrow_schema(schema, add_healpix_29): + if add_healpix_29: pandas_index_column = schema.get_field_index("__index_level_0__") if pandas_index_column != -1: schema = schema.remove(pandas_index_column) - schema = schema.insert(0, pa.field(HIPSCAT_ID_COLUMN, pa.uint64())) + schema = schema.insert(0, pa.field(SPATIAL_INDEX_COLUMN, pa.int64())) schema = ( schema.append(pa.field("Norder", pa.uint8())) .append(pa.field("Dir", pa.uint64())) diff --git a/src/hipscat_import/catalog/resume_plan.py b/src/hats_import/catalog/resume_plan.py similarity index 95% rename from src/hipscat_import/catalog/resume_plan.py rename to src/hats_import/catalog/resume_plan.py index 03acb7c9..aa7221cb 100644 --- a/src/hipscat_import/catalog/resume_plan.py +++ b/src/hats_import/catalog/resume_plan.py @@ -6,17 +6,17 @@ from dataclasses import dataclass, field from typing import List, Optional, Tuple -import hipscat.pixel_math.healpix_shim as hp +import hats.pixel_math.healpix_shim as hp import numpy as np -from hipscat import pixel_math -from hipscat.io import file_io -from hipscat.pixel_math import empty_histogram -from hipscat.pixel_math.healpix_pixel import HealpixPixel +from hats import pixel_math +from hats.io import file_io +from hats.pixel_math import empty_histogram +from hats.pixel_math.healpix_pixel import HealpixPixel from numpy import frombuffer from upath import UPath -from hipscat_import.catalog.sparse_histogram import SparseHistogram -from hipscat_import.pipeline_resume_plan import PipelineResumePlan +from hats_import.catalog.sparse_histogram import SparseHistogram +from hats_import.pipeline_resume_plan import PipelineResumePlan @dataclass @@ -59,15 +59,7 @@ def __init__( import_args=None, ): if import_args: - super().__init__( - resume=import_args.resume, - progress_bar=import_args.progress_bar, - simple_progress_bar=import_args.simple_progress_bar, - tmp_path=import_args.resume_tmp, - tmp_base_path=import_args.tmp_base_path, - delete_resume_log_files=import_args.delete_resume_log_files, - 
delete_intermediate_parquet_files=import_args.delete_intermediate_parquet_files, - ) + super().__init__(**import_args.resume_kwargs_dict()) if import_args.debug_stats_only: run_stages = ["mapping", "finishing"] self.input_paths = import_args.input_paths diff --git a/src/hipscat_import/catalog/run_import.py b/src/hats_import/catalog/run_import.py similarity index 81% rename from src/hipscat_import/catalog/run_import.py rename to src/hats_import/catalog/run_import.py index 20ca721f..745a367a 100644 --- a/src/hipscat_import/catalog/run_import.py +++ b/src/hats_import/catalog/run_import.py @@ -1,4 +1,4 @@ -"""Import a set of non-hipscat files using dask for parallelization +"""Import a set of non-hats files using dask for parallelization Methods in this file set up a dask pipeline using futures. The actual logic of the map reduce is in the `map_reduce.py` file. @@ -7,14 +7,14 @@ import os import pickle -import hipscat.io.write_metadata as io -from hipscat.catalog import PartitionInfo -from hipscat.io import paths -from hipscat.io.parquet_metadata import write_parquet_metadata +import hats.io.file_io as io +from hats.catalog import PartitionInfo +from hats.io import paths +from hats.io.parquet_metadata import write_parquet_metadata -import hipscat_import.catalog.map_reduce as mr -from hipscat_import.catalog.arguments import ImportArguments -from hipscat_import.catalog.resume_plan import ResumePlan +import hats_import.catalog.map_reduce as mr +from hats_import.catalog.arguments import ImportArguments +from hats_import.catalog.resume_plan import ResumePlan def run(args, client): @@ -43,7 +43,7 @@ def run(args, client): highest_order=args.mapping_healpix_order, ra_column=args.ra_column, dec_column=args.dec_column, - use_hipscat_index=args.use_hipscat_index, + use_healpix_29=args.use_healpix_29, ) ) resume_plan.wait_for_mapping(futures) @@ -84,7 +84,7 @@ def run(args, client): cache_shard_path=args.tmp_path, resume_path=resume_plan.tmp_path, alignment_file=alignment_file, - use_hipscat_index=args.use_hipscat_index, + use_healpix_29=args.use_healpix_29, ) ) @@ -110,9 +110,9 @@ def run(args, client): ra_column=args.ra_column, dec_column=args.dec_column, sort_columns=args.sort_columns, - add_hipscat_index=args.add_hipscat_index, + add_healpix_29=args.add_healpix_29, use_schema_file=args.use_schema_file, - use_hipscat_index=args.use_hipscat_index, + use_healpix_29=args.use_healpix_29, delete_input_files=args.delete_intermediate_parquet_files, ) ) @@ -121,17 +121,7 @@ def run(args, client): # All done - write out the metadata if resume_plan.should_run_finishing: - with resume_plan.print_progress(total=5, stage_name="Finishing") as step_progress: - catalog_info = args.to_catalog_info(total_rows) - io.write_provenance_info( - catalog_base_dir=args.catalog_path, - dataset_info=catalog_info, - tool_args=args.provenance_info(), - ) - step_progress.update(1) - - io.write_catalog_info(catalog_base_dir=args.catalog_path, dataset_info=catalog_info) - step_progress.update(1) + with resume_plan.print_progress(total=4, stage_name="Finishing") as step_progress: partition_info = PartitionInfo.from_healpix(resume_plan.get_destination_pixels()) partition_info_file = paths.get_partition_info_pointer(args.catalog_path) partition_info.write_to_file(partition_info_file) @@ -145,7 +135,12 @@ def run(args, client): else: partition_info.write_to_metadata_files(args.catalog_path) step_progress.update(1) - io.write_fits_map(args.catalog_path, raw_histogram) + catalog_info = args.to_table_properties( + total_rows, 
partition_info.get_highest_order(), partition_info.calculate_fractional_coverage() + ) + catalog_info.to_properties_file(args.catalog_path) + step_progress.update(1) + io.write_fits_image(raw_histogram, paths.get_point_map_file_pointer(args.catalog_path)) step_progress.update(1) resume_plan.clean_resume_files() step_progress.update(1) diff --git a/src/hipscat_import/catalog/sparse_histogram.py b/src/hats_import/catalog/sparse_histogram.py similarity index 98% rename from src/hipscat_import/catalog/sparse_histogram.py rename to src/hats_import/catalog/sparse_histogram.py index ac1549ae..0ed130bb 100644 --- a/src/hipscat_import/catalog/sparse_histogram.py +++ b/src/hats_import/catalog/sparse_histogram.py @@ -1,6 +1,6 @@ """Sparse 1-D histogram of healpix pixel counts.""" -import hipscat.pixel_math.healpix_shim as hp +import hats.pixel_math.healpix_shim as hp import numpy as np from scipy.sparse import csc_array, load_npz, save_npz, sparray diff --git a/src/hats_import/hipscat_conversion/__init__.py b/src/hats_import/hipscat_conversion/__init__.py new file mode 100644 index 00000000..c0086edf --- /dev/null +++ b/src/hats_import/hipscat_conversion/__init__.py @@ -0,0 +1,4 @@ +"""Convert a hipscatted catalog into a HATS catalog, with appropriate metadata/properties.""" + +from .arguments import ConversionArguments +from .run_conversion import run diff --git a/src/hats_import/hipscat_conversion/arguments.py b/src/hats_import/hipscat_conversion/arguments.py new file mode 100644 index 00000000..34e8fc33 --- /dev/null +++ b/src/hats_import/hipscat_conversion/arguments.py @@ -0,0 +1,28 @@ +"""Utility to hold all arguments required throughout hipscat -> hats conversion""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +from hats.io import file_io +from upath import UPath + +from hats_import.runtime_arguments import RuntimeArguments + + +@dataclass +class ConversionArguments(RuntimeArguments): + """Data class for holding conversion arguments. 
Mostly just inheriting from RuntimeArguments"""
+
+    ## Input
+    input_catalog_path: str | Path | UPath | None = None
+
+    def __post_init__(self):
+        self._check_arguments()
+
+    def _check_arguments(self):
+        super()._check_arguments()
+        if not self.input_catalog_path:
+            raise ValueError("input_catalog_path is required")
+        self.input_catalog_path = file_io.get_upath(self.input_catalog_path)
diff --git a/src/hats_import/hipscat_conversion/run_conversion.py b/src/hats_import/hipscat_conversion/run_conversion.py
new file mode 100644
index 00000000..be64835c
--- /dev/null
+++ b/src/hats_import/hipscat_conversion/run_conversion.py
@@ -0,0 +1,177 @@
+"""Convert a hipscatted catalog into a HATS catalog, with appropriate metadata/properties."""
+
+import json
+import tempfile
+from typing import no_type_check
+
+import hats.pixel_math.healpix_shim as hp
+import numpy as np
+import pyarrow.parquet as pq
+from dask.distributed import as_completed, get_worker
+from dask.distributed import print as dask_print
+from hats.catalog import CatalogType, PartitionInfo, TableProperties
+from hats.io import file_io, parquet_metadata, paths
+
+import hats_import
+from hats_import.hipscat_conversion.arguments import ConversionArguments
+from hats_import.pipeline_resume_plan import print_progress
+from hats_import.runtime_arguments import _estimate_dir_size
+
+
+@no_type_check
+def run(args: ConversionArguments, client):
+    """Run the hipscat-to-HATS conversion pipeline."""
+    if not args:
+        raise TypeError("args is required and should be type ConversionArguments")
+    if not isinstance(args, ConversionArguments):
+        raise TypeError("args must be type ConversionArguments")
+
+    # Create basic properties, using catalog info, provenance info, and partition_info files
+    catalog_info = None
+    with (args.input_catalog_path / "catalog_info.json").open("r", encoding="utf-8") as json_file:
+        catalog_info = json.load(json_file)
+    provenance_info = None
+    with (args.input_catalog_path / "provenance_info.json").open("r", encoding="utf-8") as json_file:
+        provenance_info = json.load(json_file)
+
+    catalog_type = CatalogType(catalog_info["catalog_type"])
+    if catalog_type not in (
+        CatalogType.OBJECT,
+        CatalogType.SOURCE,
+        CatalogType.MARGIN,
+        CatalogType.ASSOCIATION,
+    ):
+        raise ValueError("Conversion only implemented for object, source, margin, and association tables")
+
+    catalog_info.pop("epoch", None)
+    catalog_info = catalog_info | args.extra_property_dict()
+    if "tool_args" in provenance_info:
+        builder_str = (
+            provenance_info["tool_args"]["tool_name"]
+            + " v"
+            + provenance_info["tool_args"]["version"]
+            + " hats-importer conversion v"
+            + hats_import.__version__
+        )
+        catalog_info["hats_builder"] = builder_str
+        if runtime_args := provenance_info["tool_args"].get("runtime_args"):
+            catalog_info["hats_cols_sort"] = runtime_args.get("sort_columns")
+            catalog_info["hats_cols_survey_id"] = runtime_args.get("sort_columns")
+            catalog_info["hats_max_rows"] = runtime_args.get("pixel_threshold")
+
+    partition_info = PartitionInfo.read_from_dir(args.input_catalog_path)
+    catalog_info["hats_order"] = partition_info.get_highest_order()
+
+    properties = TableProperties(**catalog_info)
+
+    schema = file_io.read_parquet_metadata(
+        args.input_catalog_path / "_common_metadata"
+    ).schema.to_arrow_schema()
+
+    futures = []
+    for pixel in partition_info.get_healpix_pixels():
+        futures.append(
+            client.submit(
+                _convert_partition_file, pixel, args, schema, properties.ra_column, properties.dec_column
+            )
+        )
+    for future in print_progress(
as_completed(futures), + stage_name="Converting Parquet", + total=len(futures), + use_progress_bar=args.progress_bar, + simple_progress_bar=args.simple_progress_bar, + ): + if future.status == "error": + raise future.exception() + + with print_progress( + total=4, + stage_name="Finishing", + use_progress_bar=args.progress_bar, + simple_progress_bar=args.simple_progress_bar, + ) as step_progress: + total_rows = parquet_metadata.write_parquet_metadata(args.catalog_path) + if total_rows != properties.total_rows: + raise ValueError( + f"Unexpected number of rows (original: {properties.total_rows}" + f" written to parquet: {total_rows})" + ) + step_progress.update(1) + file_io.remove_directory(args.tmp_path, ignore_errors=True) + step_progress.update(1) + ## Update total size with newly-written parquet files. + properties.__pydantic_extra__["hats_estsize"] = int(_estimate_dir_size(args.catalog_path) / 1024) + properties.to_properties_file(args.catalog_path) + partition_info.write_to_file(args.catalog_path / "partition_info.csv") + step_progress.update(1) + _write_nested_fits_map(args.input_catalog_path, args.catalog_path) + step_progress.update(1) + + +def _convert_partition_file(pixel, args, schema, ra_column, dec_column): + try: + # Paths are changed between hipscat and HATS! + input_file = ( + args.input_catalog_path + / f"Norder={pixel.order}" + / f"Dir={pixel.dir}" + / f"Npix={pixel.pixel}.parquet" + ) + + table = pq.read_table(input_file, schema=schema) + num_rows = len(table) + + table = ( + table.drop_columns(["_hipscat_index", "Norder", "Dir", "Npix"]) + .add_column( + 0, + "_healpix_29", + [ + hp.ang2pix( + 2**29, + table[ra_column].to_numpy(), + table[dec_column].to_numpy(), + nest=True, + lonlat=True, + ) + ], + ) + .append_column("Norder", [np.full(num_rows, fill_value=pixel.order, dtype=np.int8)]) + .append_column("Dir", [np.full(num_rows, fill_value=pixel.dir, dtype=np.int64)]) + .append_column("Npix", [np.full(num_rows, fill_value=pixel.pixel, dtype=np.int64)]) + ) + + destination_file = paths.pixel_catalog_file(args.catalog_path, pixel) + destination_file.parent.mkdir(parents=True, exist_ok=True) + pq.write_table(table, destination_file.path, filesystem=destination_file.fs) + except Exception as exception: # pylint: disable=broad-exception-caught + try: + dask_print(" worker address:", get_worker().address) + except Exception: # pylint: disable=broad-exception-caught + pass + dask_print(exception) + + +def _write_nested_fits_map(input_dir, output_dir): + input_file = input_dir / "point_map.fits" + if not input_file.exists(): + return + with tempfile.NamedTemporaryFile() as _tmp_file: + with input_file.open("rb") as _map_file: + map_data = _map_file.read() + _tmp_file.write(map_data) + map_fits_image = hp.read_map(_tmp_file.name, nest=True, h=True) + header_dict = dict(map_fits_image[1]) + if header_dict["ORDERING"] != "NESTED": + map_fits_image = hp.read_map(_tmp_file.name) + else: + map_fits_image = map_fits_image[0] + + output_file = output_dir / "point_map.fits" + with tempfile.NamedTemporaryFile() as _tmp_file: + with output_file.open("wb") as _map_file: + hp.write_map( + _tmp_file.name, map_fits_image, overwrite=True, dtype=np.int32, nest=True, coord="CEL" + ) + _map_file.write(_tmp_file.read()) diff --git a/src/hipscat_import/index/__init__.py b/src/hats_import/index/__init__.py similarity index 54% rename from src/hipscat_import/index/__init__.py rename to src/hats_import/index/__init__.py index 008c9952..f59b54e8 100644 --- a/src/hipscat_import/index/__init__.py +++ 
b/src/hats_import/index/__init__.py @@ -1,4 +1,4 @@ -"""Create performance index for a single column of an already-hipscatted catalog""" +"""Create performance index for a single column of an already-hats-sharded catalog""" from .arguments import IndexArguments from .map_reduce import create_index diff --git a/src/hipscat_import/index/arguments.py b/src/hats_import/index/arguments.py similarity index 73% rename from src/hipscat_import/index/arguments.py rename to src/hats_import/index/arguments.py index 1fb6fa6b..aba9a700 100644 --- a/src/hipscat_import/index/arguments.py +++ b/src/hats_import/index/arguments.py @@ -6,12 +6,11 @@ from pathlib import Path from typing import List, Optional -from hipscat.catalog import Catalog -from hipscat.catalog.index.index_catalog_info import IndexCatalogInfo -from hipscat.io.validation import is_valid_catalog +from hats.catalog import Catalog, TableProperties +from hats.io.validation import is_valid_catalog from upath import UPath -from hipscat_import.runtime_arguments import RuntimeArguments +from hats_import.runtime_arguments import RuntimeArguments @dataclass @@ -25,8 +24,8 @@ class IndexArguments(RuntimeArguments): extra_columns: List[str] = field(default_factory=list) ## Output - include_hipscat_index: bool = True - """Include the hipscat spatial partition index.""" + include_healpix_29: bool = True + """Include the healpix-based hats spatial index.""" include_order_pixel: bool = True """Include partitioning columns, Norder, Dir, and Npix. You probably want to keep these!""" include_radec: bool = False @@ -57,12 +56,12 @@ def _check_arguments(self): if not self.indexing_column: raise ValueError("indexing_column is required") - if not self.include_hipscat_index and not self.include_order_pixel: - raise ValueError("At least one of include_hipscat_index or include_order_pixel must be True") + if not self.include_healpix_29 and not self.include_order_pixel: + raise ValueError("At least one of include_healpix_29 or include_order_pixel must be True") if not is_valid_catalog(self.input_catalog_path): raise ValueError("input_catalog_path not a valid catalog") - self.input_catalog = Catalog.read_from_hipscat(catalog_path=self.input_catalog_path) + self.input_catalog = Catalog.read_hats(catalog_path=self.input_catalog_path) if self.include_radec: catalog_info = self.input_catalog.catalog_info self.extra_columns.extend([catalog_info.ra_column, catalog_info.dec_column]) @@ -82,24 +81,15 @@ def _check_arguments(self): if self.compute_partition_size < 100_000: raise ValueError("compute_partition_size must be at least 100_000") - def to_catalog_info(self, total_rows) -> IndexCatalogInfo: + def to_table_properties(self, total_rows: int) -> TableProperties: """Catalog-type-specific dataset info.""" info = { "catalog_name": self.output_artifact_name, - "total_rows": total_rows, "catalog_type": "index", - "primary_catalog": self.input_catalog_path, + "total_rows": total_rows, + "primary_catalog": str(self.input_catalog_path), "indexing_column": self.indexing_column, "extra_columns": self.extra_columns, - } - return IndexCatalogInfo(**info) + } | self.extra_property_dict() - def additional_runtime_provenance_info(self) -> dict: - return { - "input_catalog_path": self.input_catalog_path, - "indexing_column": self.indexing_column, - "extra_columns": self.extra_columns, - "include_hipscat_index": self.include_hipscat_index, - "include_order_pixel": self.include_order_pixel, - "include_radec": self.include_radec, - } + return TableProperties(**info) diff --git 
a/src/hipscat_import/index/map_reduce.py b/src/hats_import/index/map_reduce.py similarity index 82% rename from src/hipscat_import/index/map_reduce.py rename to src/hats_import/index/map_reduce.py index 8bba30ba..fef062a7 100644 --- a/src/hipscat_import/index/map_reduce.py +++ b/src/hats_import/index/map_reduce.py @@ -1,12 +1,12 @@ -"""Create columnar index of hipscat table using dask for parallelization""" +"""Create columnar index of hats table using dask for parallelization""" import dask.dataframe as dd import numpy as np -from hipscat.io import file_io, paths -from hipscat.pixel_math.hipscat_id import HIPSCAT_ID_COLUMN +from hats.io import file_io, paths +from hats.pixel_math.spatial_index import SPATIAL_INDEX_COLUMN -def read_leaf_file(input_file, include_columns, include_hipscat_index, drop_duplicates, schema): +def read_leaf_file(input_file, include_columns, include_healpix_29, drop_duplicates, schema): """Mapping function called once per input file. Reads the leaf parquet file, and returns with appropriate columns and duplicates dropped.""" @@ -18,8 +18,8 @@ def read_leaf_file(input_file, include_columns, include_hipscat_index, drop_dupl ) data = data.reset_index() - if not include_hipscat_index: - data = data.drop(columns=[HIPSCAT_ID_COLUMN]) + if not include_healpix_29: + data = data.drop(columns=[SPATIAL_INDEX_COLUMN]) if drop_duplicates: data = data.drop_duplicates() @@ -35,7 +35,7 @@ def create_index(args, client): if args.include_order_pixel: include_columns.extend(["Norder", "Dir", "Npix"]) - index_dir = file_io.get_upath(args.catalog_path / "index") + index_dir = file_io.get_upath(args.catalog_path / "dataset" / "index") data = dd.from_map( read_leaf_file, @@ -44,7 +44,7 @@ def create_index(args, client): for pixel in args.input_catalog.get_healpix_pixels() ], include_columns=include_columns, - include_hipscat_index=args.include_hipscat_index, + include_healpix_29=args.include_healpix_29, drop_duplicates=args.drop_duplicates, schema=args.input_catalog.schema, ) diff --git a/src/hipscat_import/index/run_index.py b/src/hats_import/index/run_index.py similarity index 52% rename from src/hipscat_import/index/run_index.py rename to src/hats_import/index/run_index.py index fc324af2..2547e883 100644 --- a/src/hipscat_import/index/run_index.py +++ b/src/hats_import/index/run_index.py @@ -1,10 +1,10 @@ -"""Create columnar index of hipscat table using dask for parallelization""" +"""Create columnar index of hats table using dask for parallelization""" -from hipscat.io import file_io, parquet_metadata, write_metadata +from hats.io import file_io, parquet_metadata -import hipscat_import.index.map_reduce as mr -from hipscat_import.index.arguments import IndexArguments -from hipscat_import.pipeline_resume_plan import print_progress +import hats_import.index.map_reduce as mr +from hats_import.index.arguments import IndexArguments +from hats_import.pipeline_resume_plan import print_progress def run(args, client): @@ -17,19 +17,13 @@ def run(args, client): # All done - write out the metadata with print_progress( - total=4, + total=3, stage_name="Finishing", use_progress_bar=args.progress_bar, simple_progress_bar=args.simple_progress_bar, ) as step_progress: - index_catalog_info = args.to_catalog_info(int(rows_written)) - write_metadata.write_provenance_info( - catalog_base_dir=args.catalog_path, - dataset_info=index_catalog_info, - tool_args=args.provenance_info(), - ) - step_progress.update(1) - write_metadata.write_catalog_info(catalog_base_dir=args.catalog_path, 
dataset_info=index_catalog_info) + index_catalog_info = args.to_table_properties(int(rows_written)) + index_catalog_info.to_properties_file(args.catalog_path) step_progress.update(1) file_io.remove_directory(args.tmp_path, ignore_errors=True) step_progress.update(1) diff --git a/src/hipscat_import/margin_cache/__init__.py b/src/hats_import/margin_cache/__init__.py similarity index 100% rename from src/hipscat_import/margin_cache/__init__.py rename to src/hats_import/margin_cache/__init__.py diff --git a/src/hipscat_import/margin_cache/margin_cache.py b/src/hats_import/margin_cache/margin_cache.py similarity index 80% rename from src/hipscat_import/margin_cache/margin_cache.py rename to src/hats_import/margin_cache/margin_cache.py index c9815139..4042cde1 100644 --- a/src/hipscat_import/margin_cache/margin_cache.py +++ b/src/hats_import/margin_cache/margin_cache.py @@ -1,15 +1,15 @@ -from hipscat.catalog import PartitionInfo -from hipscat.io import file_io, parquet_metadata, paths, write_metadata +from hats.catalog import PartitionInfo +from hats.io import file_io, parquet_metadata, paths -import hipscat_import.margin_cache.margin_cache_map_reduce as mcmr -from hipscat_import.margin_cache.margin_cache_resume_plan import MarginCachePlan +import hats_import.margin_cache.margin_cache_map_reduce as mcmr +from hats_import.margin_cache.margin_cache_resume_plan import MarginCachePlan # pylint: disable=too-many-locals,too-many-arguments def generate_margin_cache(args, client): """Generate a margin cache for a given input catalog. - The input catalog must be in hipscat format. + The input catalog must be in hats format. Args: args (MarginCacheArguments): A valid `MarginCacheArguments` object. @@ -63,17 +63,13 @@ def generate_margin_cache(args, client): partition_info = PartitionInfo.read_from_file(metadata_path) partition_info_file = paths.get_partition_info_pointer(args.catalog_path) partition_info.write_to_file(partition_info_file) - step_progress.update(1) - margin_catalog_info = args.to_catalog_info(int(total_rows)) - write_metadata.write_provenance_info( - catalog_base_dir=args.catalog_path, - dataset_info=margin_catalog_info, - tool_args=args.provenance_info(), - ) - write_metadata.write_catalog_info( - catalog_base_dir=args.catalog_path, dataset_info=margin_catalog_info + margin_catalog_info = args.to_table_properties( + int(total_rows), + partition_info.get_highest_order(), + partition_info.calculate_fractional_coverage(), ) + margin_catalog_info.to_properties_file(args.catalog_path) step_progress.update(1) file_io.remove_directory(args.tmp_path, ignore_errors=True) step_progress.update(1) diff --git a/src/hipscat_import/margin_cache/margin_cache_arguments.py b/src/hats_import/margin_cache/margin_cache_arguments.py similarity index 69% rename from src/hipscat_import/margin_cache/margin_cache_arguments.py rename to src/hats_import/margin_cache/margin_cache_arguments.py index e65f8542..05535942 100644 --- a/src/hipscat_import/margin_cache/margin_cache_arguments.py +++ b/src/hats_import/margin_cache/margin_cache_arguments.py @@ -4,14 +4,13 @@ from pathlib import Path from typing import List -import hipscat.pixel_math.healpix_shim as hp -from hipscat.catalog import Catalog -from hipscat.catalog.margin_cache.margin_cache_catalog_info import MarginCacheCatalogInfo -from hipscat.io.validation import is_valid_catalog -from hipscat.pixel_math.healpix_pixel import HealpixPixel +import hats.pixel_math.healpix_shim as hp +from hats.catalog import Catalog, TableProperties +from hats.io.validation import 
is_valid_catalog +from hats.pixel_math.healpix_pixel import HealpixPixel from upath import UPath -from hipscat_import.runtime_arguments import RuntimeArguments +from hats_import.runtime_arguments import RuntimeArguments @dataclass @@ -31,15 +30,9 @@ class MarginCacheArguments(RuntimeArguments): fine_filtering: bool = True """should we perform the precise boundary checking? if false, some results may be greater than `margin_threshold` away from the border (but within `margin_order`).""" - delete_intermediate_parquet_files: bool = True - """should we delete the smaller intermediate parquet files generated in the - splitting stage, once the relevant reducing stage is complete?""" - delete_resume_log_files: bool = True - """should we delete task-level done files once each stage is complete? - if False, we will keep all done marker files at the end of the pipeline.""" input_catalog_path: str | Path | UPath | None = None - """the path to the hipscat-formatted input catalog.""" + """the path to the hats-formatted input catalog.""" debug_filter_pixel_list: List[HealpixPixel] = field(default_factory=list) """debug setting. if provided, we will first filter the catalog to the pixels provided. this can be useful for creating a margin over a subset of a catalog.""" @@ -54,7 +47,7 @@ def _check_arguments(self): if not is_valid_catalog(self.input_catalog_path): raise ValueError("input_catalog_path not a valid catalog") - self.catalog = Catalog.read_from_hipscat(self.input_catalog_path) + self.catalog = Catalog.read_hats(self.input_catalog_path) if len(self.debug_filter_pixel_list) > 0: self.catalog = self.catalog.filter_from_pixel_list(self.debug_filter_pixel_list) if len(self.catalog.get_healpix_pixels()) == 0: @@ -76,24 +69,19 @@ def _check_arguments(self): if margin_pixel_mindist * 60.0 < self.margin_threshold: raise ValueError("margin pixels must be larger than margin_threshold") - def to_catalog_info(self, total_rows) -> MarginCacheCatalogInfo: + def to_table_properties( + self, total_rows: int, highest_order: int, moc_sky_fraction: float + ) -> TableProperties: """Catalog-type-specific dataset info.""" info = { "catalog_name": self.output_artifact_name, "total_rows": total_rows, "catalog_type": "margin", - "epoch": self.catalog.catalog_info.epoch, "ra_column": self.catalog.catalog_info.ra_column, "dec_column": self.catalog.catalog_info.dec_column, - "primary_catalog": self.input_catalog_path, + "primary_catalog": str(self.input_catalog_path), "margin_threshold": self.margin_threshold, - } - return MarginCacheCatalogInfo(**info) - - def additional_runtime_provenance_info(self) -> dict: - return { - "input_catalog_path": self.input_catalog_path, - "margin_threshold": self.margin_threshold, - "margin_order": self.margin_order, - "debug_filter_pixel_list": self.debug_filter_pixel_list, - } + "hats_order": highest_order, + "moc_sky_fraction": f"{moc_sky_fraction:0.5f}", + } | self.extra_property_dict() + return TableProperties(**info) diff --git a/src/hipscat_import/margin_cache/margin_cache_map_reduce.py b/src/hats_import/margin_cache/margin_cache_map_reduce.py similarity index 81% rename from src/hipscat_import/margin_cache/margin_cache_map_reduce.py rename to src/hats_import/margin_cache/margin_cache_map_reduce.py index d63d705e..f6ac74a7 100644 --- a/src/hipscat_import/margin_cache/margin_cache_map_reduce.py +++ b/src/hats_import/margin_cache/margin_cache_map_reduce.py @@ -1,15 +1,14 @@ -import hipscat.pixel_math.healpix_shim as hp +import hats.pixel_math.healpix_shim as hp import numpy as np 
import pandas as pd import pyarrow as pa import pyarrow.dataset as ds -from hipscat import pixel_math -from hipscat.catalog.partition_info import PartitionInfo -from hipscat.io import file_io, paths -from hipscat.pixel_math.healpix_pixel import HealpixPixel +from hats import pixel_math +from hats.io import file_io, paths +from hats.pixel_math.healpix_pixel import HealpixPixel -from hipscat_import.margin_cache.margin_cache_resume_plan import MarginCachePlan -from hipscat_import.pipeline_resume_plan import get_pixel_cache_directory, print_task_failure +from hats_import.margin_cache.margin_cache_resume_plan import MarginCachePlan +from hats_import.pipeline_resume_plan import get_pixel_cache_directory, print_task_failure # pylint: disable=too-many-arguments @@ -112,22 +111,22 @@ def _to_pixel_shard( shard_path = paths.pixel_catalog_file(partition_dir, source_pixel) rename_columns = { - PartitionInfo.METADATA_ORDER_COLUMN_NAME: f"margin_{PartitionInfo.METADATA_ORDER_COLUMN_NAME}", - PartitionInfo.METADATA_DIR_COLUMN_NAME: f"margin_{PartitionInfo.METADATA_DIR_COLUMN_NAME}", - PartitionInfo.METADATA_PIXEL_COLUMN_NAME: f"margin_{PartitionInfo.METADATA_PIXEL_COLUMN_NAME}", + paths.PARTITION_ORDER: paths.MARGIN_ORDER, + paths.PARTITION_DIR: paths.MARGIN_DIR, + paths.PARTITION_PIXEL: paths.MARGIN_PIXEL, } margin_data = margin_data.rename(columns=rename_columns) - margin_data[PartitionInfo.METADATA_ORDER_COLUMN_NAME] = pixel.order - margin_data[PartitionInfo.METADATA_DIR_COLUMN_NAME] = pixel.dir - margin_data[PartitionInfo.METADATA_PIXEL_COLUMN_NAME] = pixel.pixel + margin_data[paths.PARTITION_ORDER] = pixel.order + margin_data[paths.PARTITION_DIR] = pixel.dir + margin_data[paths.PARTITION_PIXEL] = pixel.pixel margin_data = margin_data.astype( { - PartitionInfo.METADATA_ORDER_COLUMN_NAME: np.uint8, - PartitionInfo.METADATA_DIR_COLUMN_NAME: np.uint64, - PartitionInfo.METADATA_PIXEL_COLUMN_NAME: np.uint64, + paths.PARTITION_ORDER: np.uint8, + paths.PARTITION_DIR: np.uint64, + paths.PARTITION_PIXEL: np.uint64, } ) margin_data = margin_data.sort_index() @@ -152,9 +151,9 @@ def reduce_margin_shards( schema = file_io.read_parquet_metadata(original_catalog_metadata).schema.to_arrow_schema() schema = ( - schema.append(pa.field("margin_Norder", pa.uint8())) - .append(pa.field("margin_Dir", pa.uint64())) - .append(pa.field("margin_Npix", pa.uint64())) + schema.append(pa.field(paths.MARGIN_ORDER, pa.uint8())) + .append(pa.field(paths.MARGIN_DIR, pa.uint64())) + .append(pa.field(paths.MARGIN_PIXEL, pa.uint64())) ) data = ds.dataset(shard_dir, format="parquet", schema=schema) full_df = data.to_table().to_pandas() diff --git a/src/hipscat_import/margin_cache/margin_cache_resume_plan.py b/src/hats_import/margin_cache/margin_cache_resume_plan.py similarity index 90% rename from src/hipscat_import/margin_cache/margin_cache_resume_plan.py rename to src/hats_import/margin_cache/margin_cache_resume_plan.py index 000e1ae2..f1239d23 100644 --- a/src/hipscat_import/margin_cache/margin_cache_resume_plan.py +++ b/src/hats_import/margin_cache/margin_cache_resume_plan.py @@ -6,12 +6,12 @@ from typing import List import pandas as pd -from hipscat import pixel_math -from hipscat.io import file_io -from hipscat.pixel_math.healpix_pixel import HealpixPixel +from hats import pixel_math +from hats.io import file_io +from hats.pixel_math.healpix_pixel import HealpixPixel -from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments -from hipscat_import.pipeline_resume_plan import PipelineResumePlan +from 
hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments +from hats_import.pipeline_resume_plan import PipelineResumePlan @dataclass @@ -29,15 +29,7 @@ class MarginCachePlan(PipelineResumePlan): def __init__(self, args: MarginCacheArguments): if not args.tmp_path: # pragma: no cover (not reachable, but required for mypy) raise ValueError("tmp_path is required") - super().__init__( - resume=args.resume, - progress_bar=args.progress_bar, - simple_progress_bar=args.simple_progress_bar, - tmp_path=args.tmp_path, - tmp_base_path=args.tmp_base_path, - delete_resume_log_files=args.delete_resume_log_files, - delete_intermediate_parquet_files=args.delete_intermediate_parquet_files, - ) + super().__init__(**args.resume_kwargs_dict()) self._gather_plan(args) def _gather_plan(self, args): diff --git a/src/hipscat_import/pipeline.py b/src/hats_import/pipeline.py similarity index 75% rename from src/hipscat_import/pipeline.py rename to src/hats_import/pipeline.py index dd4487d5..f696b4be 100644 --- a/src/hipscat_import/pipeline.py +++ b/src/hats_import/pipeline.py @@ -5,17 +5,17 @@ from dask.distributed import Client -import hipscat_import.catalog.run_import as catalog_runner -import hipscat_import.index.run_index as index_runner -import hipscat_import.margin_cache.margin_cache as margin_runner -import hipscat_import.soap.run_soap as soap_runner -import hipscat_import.verification.run_verification as verification_runner -from hipscat_import.catalog.arguments import ImportArguments -from hipscat_import.index.arguments import IndexArguments -from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments -from hipscat_import.runtime_arguments import RuntimeArguments -from hipscat_import.soap.arguments import SoapArguments -from hipscat_import.verification.arguments import VerificationArguments +import hats_import.catalog.run_import as catalog_runner +import hats_import.index.run_index as index_runner +import hats_import.margin_cache.margin_cache as margin_runner +import hats_import.soap.run_soap as soap_runner +import hats_import.verification.run_verification as verification_runner +from hats_import.catalog.arguments import ImportArguments +from hats_import.index.arguments import IndexArguments +from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments +from hats_import.runtime_arguments import RuntimeArguments +from hats_import.soap.arguments import SoapArguments +from hats_import.verification.arguments import VerificationArguments # pragma: no cover @@ -62,7 +62,7 @@ def pipeline_with_client(args: RuntimeArguments, client: Client): def _send_failure_email(args: RuntimeArguments, exception: Exception): message = EmailMessage() - message["Subject"] = "hipscat-import failure." + message["Subject"] = "hats-import failure." message["To"] = args.completion_email_address message.set_content( f"output_artifact_name: {args.output_artifact_name}" @@ -77,7 +77,7 @@ def _send_success_email(args): if not args.completion_email_address: return message = EmailMessage() - message["Subject"] = "hipscat-import success." + message["Subject"] = "hats-import success." 
message["To"] = args.completion_email_address message.set_content(f"output_artifact_name: {args.output_artifact_name}") diff --git a/src/hipscat_import/pipeline_resume_plan.py b/src/hats_import/pipeline_resume_plan.py similarity index 99% rename from src/hipscat_import/pipeline_resume_plan.py rename to src/hats_import/pipeline_resume_plan.py index 95a966af..9e49b15f 100644 --- a/src/hipscat_import/pipeline_resume_plan.py +++ b/src/hats_import/pipeline_resume_plan.py @@ -8,8 +8,8 @@ from dask.distributed import as_completed, get_worker from dask.distributed import print as dask_print -from hipscat.io import file_io -from hipscat.pixel_math.healpix_pixel import HealpixPixel +from hats.io import file_io +from hats.pixel_math.healpix_pixel import HealpixPixel from tqdm.auto import tqdm as auto_tqdm from tqdm.std import tqdm as std_tqdm from upath import UPath @@ -218,7 +218,7 @@ def get_pixel_cache_directory(cache_path, pixel: HealpixPixel): """Create a path for intermediate pixel data. You can use this over the paths.get_pixel_directory method, as it will include the pixel - number in the path. Further, it will just *look* different from a real hipscat + number in the path. Further, it will just *look* different from a real hats path, so it's clearer that it's a temporary directory:: {cache_path}/order_{order}/dir_{dir}/pixel_{pixel}/ diff --git a/src/hipscat_import/py.typed b/src/hats_import/py.typed similarity index 100% rename from src/hipscat_import/py.typed rename to src/hats_import/py.typed diff --git a/src/hipscat_import/runtime_arguments.py b/src/hats_import/runtime_arguments.py similarity index 73% rename from src/hipscat_import/runtime_arguments.py rename to src/hats_import/runtime_arguments.py index eae0ad89..cae7322e 100644 --- a/src/hipscat_import/runtime_arguments.py +++ b/src/hats_import/runtime_arguments.py @@ -4,10 +4,11 @@ import re from dataclasses import dataclass +from datetime import datetime, timezone from importlib.metadata import version from pathlib import Path -from hipscat.io import file_io +from hats.io import file_io from upath import UPath # pylint: disable=too-many-instance-attributes @@ -22,6 +23,11 @@ class RuntimeArguments: """base path where new catalog should be output""" output_artifact_name: str = "" """short, convenient name for the catalog""" + addl_hats_properties: dict | None = None + """Any additional keyword arguments you would like to provide when writing + the `properties` file for the final HATS table. e.g. + {"hats_cols_default":"id, mjd", "hats_cols_survey_id":"unique_id", + "creator_did": "ivo://CDS/P/2MASS/J"}""" ## Execution tmp_dir: str | Path | UPath | None = None @@ -46,6 +52,12 @@ class RuntimeArguments: """number of threads per dask worker""" resume_tmp: str | Path | UPath | None = None """directory for intermediate resume files, when needed. see RTD for more info.""" + delete_intermediate_parquet_files: bool = True + """should we delete the smaller intermediate parquet files generated in the + splitting stage, once the relevant reducing stage is complete?""" + delete_resume_log_files: bool = True + """should we delete task-level done files once each stage is complete? 
+ if False, we will keep all done marker files at the end of the pipeline.""" completion_email_address: str = "" """if provided, send an email to the indicated email address once the @@ -103,37 +115,34 @@ def _check_arguments(self): else: self.resume_tmp = self.tmp_path - def provenance_info(self) -> dict: - """Fill all known information in a dictionary for provenance tracking. - - Returns: - dictionary with all argument_name -> argument_value as key -> value pairs. - """ - runtime_args = { - "catalog_name": self.output_artifact_name, - "output_path": self.output_path, - "output_artifact_name": self.output_artifact_name, - "tmp_dir": self.tmp_dir, - "dask_tmp": self.dask_tmp, - "dask_n_workers": self.dask_n_workers, - "dask_threads_per_worker": self.dask_threads_per_worker, - "catalog_path": self.catalog_path, - "tmp_path": self.tmp_path, + def extra_property_dict(self): + """Generate additional HATS properties for this import run as a dictionary.""" + properties = {} + + properties["hats_builder"] = f"hats-import v{version('hats-import')}" + + now = datetime.now(tz=timezone.utc) + properties["hats_creation_date"] = now.strftime("%Y-%m-%dT%H:%M%Z") + properties["hats_estsize"] = int(_estimate_dir_size(self.catalog_path) / 1024) + properties["hats_release_date"] = "2024-09-18" + properties["hats_version"] = "v0.1" + + if self.addl_hats_properties: + properties = properties | self.addl_hats_properties + return properties + + def resume_kwargs_dict(self): + """Convenience method to convert fields for resume functionality.""" + return { + "resume": self.resume, + "progress_bar": self.progress_bar, + "simple_progress_bar": self.simple_progress_bar, + "tmp_path": self.resume_tmp, + "tmp_base_path": self.tmp_base_path, + "delete_resume_log_files": self.delete_resume_log_files, + "delete_intermediate_parquet_files": self.delete_intermediate_parquet_files, } - runtime_args.update(self.additional_runtime_provenance_info()) - provenance_info = { - "tool_name": "hipscat_import", - "version": version("hipscat-import"), - "runtime_args": runtime_args, - } - - return provenance_info - - def additional_runtime_provenance_info(self): - """Any additional runtime args to be included in provenance info from subclasses""" - return {} - def find_input_paths(input_path="", file_matcher="", input_file_list=None): """Helper method to find input paths, given either a prefix and format, or an @@ -166,3 +175,13 @@ def find_input_paths(input_path="", file_matcher="", input_file_list=None): if len(input_paths) == 0: raise FileNotFoundError("No input files found") return input_paths + + +def _estimate_dir_size(target_dir): + total_size = 0 + for item in target_dir.iterdir(): + if item.is_dir(): + total_size += _estimate_dir_size(item) + else: + total_size += item.stat().st_size + return total_size diff --git a/src/hipscat_import/soap/__init__.py b/src/hats_import/soap/__init__.py similarity index 100% rename from src/hipscat_import/soap/__init__.py rename to src/hats_import/soap/__init__.py diff --git a/src/hipscat_import/soap/arguments.py b/src/hats_import/soap/arguments.py similarity index 54% rename from src/hipscat_import/soap/arguments.py rename to src/hats_import/soap/arguments.py index 2cfe5fe1..1c11c8b6 100644 --- a/src/hipscat_import/soap/arguments.py +++ b/src/hats_import/soap/arguments.py @@ -3,13 +3,12 @@ from dataclasses import dataclass from pathlib import Path -from hipscat.catalog import Catalog -from hipscat.catalog.association_catalog.association_catalog import AssociationCatalogInfo -from 
hipscat.catalog.catalog_type import CatalogType -from hipscat.io.validation import is_valid_catalog +from hats.catalog import Catalog, TableProperties +from hats.catalog.catalog_type import CatalogType +from hats.io.validation import is_valid_catalog from upath import UPath -from hipscat_import.runtime_arguments import RuntimeArguments +from hats_import.runtime_arguments import RuntimeArguments @dataclass @@ -25,18 +24,9 @@ class SoapArguments(RuntimeArguments): source_object_id_column: str = "" source_id_column: str = "" - resume: bool = True - """if there are existing intermediate resume files, should we - read those and continue to run the pipeline where we left off""" - delete_resume_log_files: bool = True - """should we delete task-level done files once each stage is complete? - if False, we will keep all done marker files at the end of the pipeline.""" write_leaf_files: bool = False """Should we also write out leaf parquet files (e.g. Norder/Dir/Npix.parquet) that represent the full association table""" - delete_intermediate_parquet_files: bool = True - """should we delete the smaller intermediate parquet files generated in the - mapping stage, once the relevant reducing stage is complete?""" compute_partition_size: int = 1_000_000_000 @@ -52,7 +42,7 @@ def _check_arguments(self): if not is_valid_catalog(self.object_catalog_dir): raise ValueError("object_catalog_dir not a valid catalog") - self.object_catalog = Catalog.read_from_hipscat(catalog_path=self.object_catalog_dir) + self.object_catalog = Catalog.read_hats(catalog_path=self.object_catalog_dir) if not self.source_catalog_dir: raise ValueError("source_catalog_dir is required") @@ -61,12 +51,12 @@ def _check_arguments(self): if not is_valid_catalog(self.source_catalog_dir): raise ValueError("source_catalog_dir not a valid catalog") - self.source_catalog = Catalog.read_from_hipscat(catalog_path=self.source_catalog_dir) + self.source_catalog = Catalog.read_hats(catalog_path=self.source_catalog_dir) if self.compute_partition_size < 100_000: raise ValueError("compute_partition_size must be at least 100_000") - def to_catalog_info(self, total_rows) -> AssociationCatalogInfo: + def to_table_properties(self, total_rows=10, highest_order=4, moc_sky_fraction=22 / 7) -> TableProperties: """Catalog-type-specific dataset info.""" info = { "catalog_name": self.output_artifact_name, @@ -74,21 +64,12 @@ def to_catalog_info(self, total_rows) -> AssociationCatalogInfo: "total_rows": total_rows, "primary_column": self.object_id_column, "primary_column_association": "object_id", - "primary_catalog": self.object_catalog_dir, + "primary_catalog": str(self.object_catalog_dir), "join_column": self.source_object_id_column, "join_column_association": "source_id", - "join_catalog": self.source_catalog_dir, + "join_catalog": str(self.source_catalog_dir), "contains_leaf_files": self.write_leaf_files, - } - return AssociationCatalogInfo(**info) - - def additional_runtime_provenance_info(self) -> dict: - return { - "object_catalog_dir": self.object_catalog_dir, - "object_id_column": self.object_id_column, - "source_catalog_dir": self.source_catalog_dir, - "source_object_id_column": self.source_object_id_column, - "source_id_column": self.source_id_column, - "compute_partition_size": self.compute_partition_size, - "write_leaf_files": self.write_leaf_files, - } + "hats_order": highest_order, + "moc_sky_fraction": f"{moc_sky_fraction:0.5f}", + } | self.extra_property_dict() + return TableProperties(**info) diff --git a/src/hipscat_import/soap/map_reduce.py 
b/src/hats_import/soap/map_reduce.py similarity index 93% rename from src/hipscat_import/soap/map_reduce.py rename to src/hats_import/soap/map_reduce.py index 009d921e..44d8c612 100644 --- a/src/hipscat_import/soap/map_reduce.py +++ b/src/hats_import/soap/map_reduce.py @@ -5,15 +5,15 @@ import numpy as np import pandas as pd import pyarrow.parquet as pq -from hipscat.catalog.association_catalog.partition_join_info import PartitionJoinInfo -from hipscat.io import file_io, paths -from hipscat.io.parquet_metadata import get_healpix_pixel_from_metadata -from hipscat.pixel_math.healpix_pixel import HealpixPixel -from hipscat.pixel_math.healpix_pixel_function import get_pixel_argsort +from hats.catalog.association_catalog.partition_join_info import PartitionJoinInfo +from hats.io import file_io, paths +from hats.io.parquet_metadata import get_healpix_pixel_from_metadata +from hats.pixel_math.healpix_pixel import HealpixPixel +from hats.pixel_math.healpix_pixel_function import get_pixel_argsort -from hipscat_import.pipeline_resume_plan import get_pixel_cache_directory, print_task_failure -from hipscat_import.soap.arguments import SoapArguments -from hipscat_import.soap.resume_plan import SoapPlan +from hats_import.pipeline_resume_plan import get_pixel_cache_directory, print_task_failure +from hats_import.soap.arguments import SoapArguments +from hats_import.soap.resume_plan import SoapPlan def _count_joins_for_object(source_data, source_pixel, object_pixel, soap_args): @@ -84,7 +84,7 @@ def count_joins(soap_args: SoapArguments, source_pixel: HealpixPixel, object_pix If any un-joined source pixels remain, stretch out to neighboring object pixels. Args: - soap_args(`hipscat_import.soap.SoapArguments`): set of arguments for pipeline execution + soap_args(`hats_import.soap.SoapArguments`): set of arguments for pipeline execution source_pixel(HealpixPixel): order and pixel for the source catalog single pixel. object_pixels(List[HealpixPixel]): set of tuples of order and pixel for the partitions of the object catalog to be joined. 
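For context on the argument-class changes above: `SoapArguments` now relies on the shared resume/delete fields inherited from `RuntimeArguments`, and builds its final metadata through `to_table_properties` merged with the common `extra_property_dict()` keys. A minimal usage sketch follows, mirroring the notebook-style examples later in this change set; the catalog paths and column names are hypothetical placeholders, both input catalogs would need to exist in HATS format, and it assumes the dask-based runner in `hats_import.pipeline` dispatches on the argument type as before.

```python
# Sketch only: running the SOAP (association) pipeline with the renamed hats_import API.
# All paths and column names are hypothetical; real HATS catalogs are required to run.
from dask.distributed import Client

import hats_import.pipeline as runner
from hats_import.soap.arguments import SoapArguments

args = SoapArguments(
    object_catalog_dir="/data/hats/small_sky_object_catalog",   # hypothetical input
    object_id_column="id",
    source_catalog_dir="/data/hats/small_sky_source_catalog",   # hypothetical input
    source_object_id_column="object_id",
    source_id_column="source_id",
    write_leaf_files=True,
    output_path="/data/hats",                                    # hypothetical output dir
    output_artifact_name="small_sky_association",
)

# The runner writes the association table, then the properties file via
# args.to_table_properties(...).to_properties_file(...), as shown in run_soap.py below.
with Client(n_workers=1, threads_per_worker=1) as client:
    runner.pipeline_with_client(args, client)
```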
diff --git a/src/hipscat_import/soap/resume_plan.py b/src/hats_import/soap/resume_plan.py similarity index 90% rename from src/hipscat_import/soap/resume_plan.py rename to src/hats_import/soap/resume_plan.py index a77eb25d..abb04f58 100644 --- a/src/hipscat_import/soap/resume_plan.py +++ b/src/hats_import/soap/resume_plan.py @@ -5,15 +5,15 @@ from dataclasses import dataclass, field from typing import List, Optional, Tuple -import hipscat.pixel_math.healpix_shim as hp +import hats.pixel_math.healpix_shim as hp import numpy as np -from hipscat.catalog import Catalog -from hipscat.io import file_io -from hipscat.pixel_math.healpix_pixel import HealpixPixel -from hipscat.pixel_tree import PixelAlignment, align_trees +from hats.catalog import Catalog +from hats.io import file_io +from hats.pixel_math.healpix_pixel import HealpixPixel +from hats.pixel_tree import PixelAlignment, align_trees -from hipscat_import.pipeline_resume_plan import PipelineResumePlan -from hipscat_import.soap.arguments import SoapArguments +from hats_import.pipeline_resume_plan import PipelineResumePlan +from hats_import.soap.arguments import SoapArguments @dataclass @@ -35,15 +35,7 @@ class SoapPlan(PipelineResumePlan): def __init__(self, args: SoapArguments): if not args.tmp_path: # pragma: no cover (not reachable, but required for mypy) raise ValueError("tmp_path is required") - super().__init__( - resume=args.resume, - progress_bar=args.progress_bar, - simple_progress_bar=args.simple_progress_bar, - tmp_path=args.tmp_path, - tmp_base_path=args.tmp_base_path, - delete_resume_log_files=args.delete_resume_log_files, - delete_intermediate_parquet_files=args.delete_intermediate_parquet_files, - ) + super().__init__(**args.resume_kwargs_dict()) self.gather_plan(args) def gather_plan(self, args): @@ -59,12 +51,12 @@ def gather_plan(self, args): return step_progress.update(1) - self.object_catalog = Catalog.read_from_hipscat(args.object_catalog_dir) + self.object_catalog = Catalog.read_hats(args.object_catalog_dir) source_map_file = file_io.append_paths_to_pointer(self.tmp_path, self.SOURCE_MAP_FILE) if file_io.does_file_or_directory_exist(source_map_file): source_pixel_map = np.load(source_map_file, allow_pickle=True)["arr_0"].item() else: - source_catalog = Catalog.read_from_hipscat(args.source_catalog_dir) + source_catalog = Catalog.read_hats(args.source_catalog_dir) source_pixel_map = source_to_object_map(self.object_catalog, source_catalog) np.savez_compressed(source_map_file, source_pixel_map) self.count_keys = self.get_sources_to_count(source_pixel_map=source_pixel_map) diff --git a/src/hipscat_import/soap/run_soap.py b/src/hats_import/soap/run_soap.py similarity index 76% rename from src/hipscat_import/soap/run_soap.py rename to src/hats_import/soap/run_soap.py index d5f7a0cf..8bd59459 100644 --- a/src/hipscat_import/soap/run_soap.py +++ b/src/hats_import/soap/run_soap.py @@ -3,12 +3,12 @@ The actual logic of the map reduce is in the `map_reduce.py` file. 
""" -from hipscat.catalog.association_catalog.partition_join_info import PartitionJoinInfo -from hipscat.io import parquet_metadata, paths, write_metadata +from hats.catalog import PartitionInfo, PartitionJoinInfo +from hats.io import parquet_metadata, paths -from hipscat_import.soap.arguments import SoapArguments -from hipscat_import.soap.map_reduce import combine_partial_results, count_joins, reduce_joins -from hipscat_import.soap.resume_plan import SoapPlan +from hats_import.soap.arguments import SoapArguments +from hats_import.soap.map_reduce import combine_partial_results, count_joins, reduce_joins +from hats_import.soap.resume_plan import SoapPlan def run(args, client): @@ -57,14 +57,13 @@ def run(args, client): else: total_rows = combine_partial_results(args.tmp_path, args.catalog_path) step_progress.update(1) - catalog_info = args.to_catalog_info(total_rows) - write_metadata.write_provenance_info( - catalog_base_dir=args.catalog_path, - dataset_info=catalog_info, - tool_args=args.provenance_info(), + partition_info = PartitionInfo.read_from_dir(args.catalog_path) + catalog_info = args.to_table_properties( + total_rows, partition_info.get_highest_order(), partition_info.calculate_fractional_coverage() ) + catalog_info.to_properties_file(args.catalog_path) step_progress.update(1) - write_metadata.write_catalog_info(dataset_info=catalog_info, catalog_base_dir=args.catalog_path) + ## TODO - optionally write out arguments file step_progress.update(1) resume_plan.clean_resume_files() step_progress.update(1) diff --git a/src/hipscat_import/verification/__init__.py b/src/hats_import/verification/__init__.py similarity index 100% rename from src/hipscat_import/verification/__init__.py rename to src/hats_import/verification/__init__.py diff --git a/src/hipscat_import/verification/arguments.py b/src/hats_import/verification/arguments.py similarity index 74% rename from src/hipscat_import/verification/arguments.py rename to src/hats_import/verification/arguments.py index 86c139b1..d17a30ed 100644 --- a/src/hipscat_import/verification/arguments.py +++ b/src/hats_import/verification/arguments.py @@ -6,11 +6,11 @@ from pathlib import Path from typing import List, Optional -from hipscat.catalog import Catalog -from hipscat.io.validation import is_valid_catalog +from hats.catalog import Catalog +from hats.io.validation import is_valid_catalog from upath import UPath -from hipscat_import.runtime_arguments import RuntimeArguments +from hats_import.runtime_arguments import RuntimeArguments @dataclass @@ -39,13 +39,6 @@ def _check_arguments(self): if not self.input_catalog: if not is_valid_catalog(self.input_catalog_path): raise ValueError("input_catalog_path not a valid catalog") - self.input_catalog = Catalog.read_from_hipscat(catalog_path=self.input_catalog_path) + self.input_catalog = Catalog.read_hats(catalog_path=self.input_catalog_path) if not self.input_catalog_path: self.input_catalog_path = self.input_catalog.catalog_path - - def additional_runtime_provenance_info(self) -> dict: - return { - "pipeline": "verification pipeline", - "input_catalog_path": self.input_catalog_path, - "field_distribution_cols": self.field_distribution_cols, - } diff --git a/src/hipscat_import/verification/run_verification.py b/src/hats_import/verification/run_verification.py similarity index 82% rename from src/hipscat_import/verification/run_verification.py rename to src/hats_import/verification/run_verification.py index 2b7d5954..ea623ddf 100644 --- a/src/hipscat_import/verification/run_verification.py +++ 
b/src/hats_import/verification/run_verification.py @@ -1,6 +1,6 @@ -"""Run pass/fail checks and generate verification report of existing hipscat table.""" +"""Run pass/fail checks and generate verification report of existing hats table.""" -from hipscat_import.verification.arguments import VerificationArguments +from hats_import.verification.arguments import VerificationArguments def run(args): diff --git a/tests/conftest.py b/tests/conftest.py index 7666200f..2d9f12ff 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -39,7 +39,7 @@ def dask_client(use_ray): def pytest_addoption(parser): """Add command line option to test dask unit tests on ray. - This must live in /tests/conftest.py (not /tests/hipscat-import/conftest.py)""" + This must live in /tests/conftest.py (not /tests/hats-import/conftest.py)""" parser.addoption( "--use_ray", action="store_true", diff --git a/tests/hipscat_import/data/blank/blank.csv b/tests/data/blank/blank.csv similarity index 100% rename from tests/hipscat_import/data/blank/blank.csv rename to tests/data/blank/blank.csv diff --git a/tests/data/generate_data.ipynb b/tests/data/generate_data.ipynb new file mode 100644 index 00000000..d3ba24c9 --- /dev/null +++ b/tests/data/generate_data.ipynb @@ -0,0 +1,145 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Unit test data\n", + "\n", + "This directory contains very small, toy, data sets that are used\n", + "for unit tests.\n", + "\n", + "## Object catalog: small_sky\n", + "\n", + "This \"object catalog\" is 131 randomly generated radec values. \n", + "\n", + "- All radec positions are in the Healpix pixel order 0, pixel 11.\n", + "- IDs are integers from 700-831.\n", + "\n", + "The following are imports and paths that are used throughout the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import hats_import.pipeline as runner\n", + "from hats_import.catalog.arguments import ImportArguments\n", + "import tempfile\n", + "from pathlib import Path\n", + "from dask.distributed import Client\n", + "\n", + "tmp_path = tempfile.TemporaryDirectory()\n", + "tmp_dir = tmp_path.name\n", + "\n", + "hats_import_dir = \".\"\n", + "client = Client(n_workers=1, threads_per_worker=1, local_directory=tmp_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### small_sky\n", + "\n", + "This \"object catalog\" is 131 randomly generated radec values. \n", + "\n", + "- All radec positions are in the Healpix pixel order 0, pixel 11.\n", + "- IDs are integers from 700-831.\n", + "\n", + "This catalog was generated with the following snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with tempfile.TemporaryDirectory() as pipeline_tmp:\n", + " args = ImportArguments(\n", + " input_path=Path(hats_import_dir) / \"small_sky\",\n", + " output_path=\".\",\n", + " file_reader=\"csv\",\n", + " highest_healpix_order=5,\n", + " output_artifact_name=\"small_sky_object_catalog\",\n", + " tmp_dir=pipeline_tmp,\n", + " )\n", + " runner.pipeline_with_client(args, client)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Source catalog: small_sky_source\n", + "\n", + "This \"source catalog\" is 131 detections at each of the 131 objects\n", + "in the \"small_sky\" catalog. These have a random magnitude, MJD, and \n", + "band (selected from ugrizy). 
The full script that generated the values\n", + "can be found [here](https://github.com/delucchi-cmu/hipscripts/blob/main/twiddling/small_sky_source.py)\n", + "\n", + "The catalog was generated with the following snippet, using raw data \n", + "from the `hats-import` file.\n", + "\n", + "NB: `pixel_threshold=3000` is set just to make sure that we're generating\n", + "a handful of files at various healpix orders." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with tempfile.TemporaryDirectory() as pipeline_tmp:\n", + " args = ImportArguments(\n", + " input_path=Path(hats_import_dir) / \"small_sky_source\",\n", + " output_path=\".\",\n", + " file_reader=\"csv\",\n", + " ra_column=\"source_ra\",\n", + " dec_column=\"source_dec\",\n", + " catalog_type=\"source\",\n", + " highest_healpix_order=5,\n", + " pixel_threshold=3000,\n", + " output_artifact_name=\"small_sky_source_catalog\",\n", + " tmp_dir=pipeline_tmp,\n", + " )\n", + " runner.pipeline_with_client(args, client)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.close()\n", + "tmp_path.cleanup()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/hipscat_import/data/small_sky_object_catalog/Norder=0/Dir=0/Npix=11.parquet b/tests/data/hipscat/small_sky_object_catalog/Norder=0/Dir=0/Npix=11.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_object_catalog/Norder=0/Dir=0/Npix=11.parquet rename to tests/data/hipscat/small_sky_object_catalog/Norder=0/Dir=0/Npix=11.parquet diff --git a/tests/hipscat_import/data/small_sky_object_catalog/_common_metadata b/tests/data/hipscat/small_sky_object_catalog/_common_metadata similarity index 100% rename from tests/hipscat_import/data/small_sky_object_catalog/_common_metadata rename to tests/data/hipscat/small_sky_object_catalog/_common_metadata diff --git a/tests/hipscat_import/data/small_sky_object_catalog/_metadata b/tests/data/hipscat/small_sky_object_catalog/_metadata similarity index 100% rename from tests/hipscat_import/data/small_sky_object_catalog/_metadata rename to tests/data/hipscat/small_sky_object_catalog/_metadata diff --git a/tests/hipscat_import/data/small_sky_object_catalog/catalog_info.json b/tests/data/hipscat/small_sky_object_catalog/catalog_info.json similarity index 100% rename from tests/hipscat_import/data/small_sky_object_catalog/catalog_info.json rename to tests/data/hipscat/small_sky_object_catalog/catalog_info.json diff --git a/tests/hipscat_import/data/small_sky_object_catalog/partition_info.csv b/tests/data/hipscat/small_sky_object_catalog/partition_info.csv similarity index 100% rename from tests/hipscat_import/data/small_sky_object_catalog/partition_info.csv rename to tests/data/hipscat/small_sky_object_catalog/partition_info.csv diff --git a/tests/hipscat_import/data/small_sky_object_catalog/provenance_info.json b/tests/data/hipscat/small_sky_object_catalog/provenance_info.json similarity index 100% rename from tests/hipscat_import/data/small_sky_object_catalog/provenance_info.json rename to 
tests/data/hipscat/small_sky_object_catalog/provenance_info.json diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=0/Dir=0/Npix=4.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=0/Dir=0/Npix=4.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=0/Dir=0/Npix=4.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=0/Dir=0/Npix=4.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=1/Dir=0/Npix=47.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=1/Dir=0/Npix=47.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=1/Dir=0/Npix=47.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=1/Dir=0/Npix=47.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=176.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=176.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=176.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=176.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=177.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=177.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=177.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=177.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=178.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=178.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=178.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=178.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=179.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=179.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=179.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=179.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=180.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=180.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=180.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=180.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=181.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=181.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=181.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=181.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=182.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=182.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=182.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=182.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=183.parquet 
b/tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=183.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=183.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=183.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=184.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=184.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=184.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=184.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=185.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=185.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=185.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=185.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=186.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=186.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=186.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=186.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=187.parquet b/tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=187.parquet similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/Norder=2/Dir=0/Npix=187.parquet rename to tests/data/hipscat/small_sky_source_catalog/Norder=2/Dir=0/Npix=187.parquet diff --git a/tests/hipscat_import/data/small_sky_source_catalog/_common_metadata b/tests/data/hipscat/small_sky_source_catalog/_common_metadata similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/_common_metadata rename to tests/data/hipscat/small_sky_source_catalog/_common_metadata diff --git a/tests/hipscat_import/data/small_sky_source_catalog/_metadata b/tests/data/hipscat/small_sky_source_catalog/_metadata similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/_metadata rename to tests/data/hipscat/small_sky_source_catalog/_metadata diff --git a/tests/hipscat_import/data/small_sky_source_catalog/catalog_info.json b/tests/data/hipscat/small_sky_source_catalog/catalog_info.json similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/catalog_info.json rename to tests/data/hipscat/small_sky_source_catalog/catalog_info.json diff --git a/tests/hipscat_import/data/small_sky_source_catalog/partition_info.csv b/tests/data/hipscat/small_sky_source_catalog/partition_info.csv similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/partition_info.csv rename to tests/data/hipscat/small_sky_source_catalog/partition_info.csv diff --git a/tests/hipscat_import/data/small_sky_source_catalog/point_map.fits b/tests/data/hipscat/small_sky_source_catalog/point_map.fits similarity index 100% rename from tests/hipscat_import/data/small_sky_source_catalog/point_map.fits rename to tests/data/hipscat/small_sky_source_catalog/point_map.fits diff --git a/tests/hipscat_import/data/small_sky_source_catalog/provenance_info.json b/tests/data/hipscat/small_sky_source_catalog/provenance_info.json similarity index 100% rename from 
tests/hipscat_import/data/small_sky_source_catalog/provenance_info.json rename to tests/data/hipscat/small_sky_source_catalog/provenance_info.json diff --git a/tests/data/indexed_files/csv_list_double_1_of_2.txt b/tests/data/indexed_files/csv_list_double_1_of_2.txt new file mode 100644 index 00000000..a30f60be --- /dev/null +++ b/tests/data/indexed_files/csv_list_double_1_of_2.txt @@ -0,0 +1,3 @@ +tests/data/small_sky_parts/catalog_00_of_05.csv +tests/data/small_sky_parts/catalog_01_of_05.csv + diff --git a/tests/data/indexed_files/csv_list_double_2_of_2.txt b/tests/data/indexed_files/csv_list_double_2_of_2.txt new file mode 100644 index 00000000..bb12c6db --- /dev/null +++ b/tests/data/indexed_files/csv_list_double_2_of_2.txt @@ -0,0 +1,3 @@ +tests/data/small_sky_parts/catalog_02_of_05.csv +tests/data/small_sky_parts/catalog_03_of_05.csv +tests/data/small_sky_parts/catalog_04_of_05.csv \ No newline at end of file diff --git a/tests/data/indexed_files/csv_list_single.txt b/tests/data/indexed_files/csv_list_single.txt new file mode 100644 index 00000000..0d98af84 --- /dev/null +++ b/tests/data/indexed_files/csv_list_single.txt @@ -0,0 +1,6 @@ +tests/data/small_sky_parts/catalog_00_of_05.csv +tests/data/small_sky_parts/catalog_01_of_05.csv +tests/data/small_sky_parts/catalog_02_of_05.csv +tests/data/small_sky_parts/catalog_03_of_05.csv +tests/data/small_sky_parts/catalog_04_of_05.csv + diff --git a/tests/data/indexed_files/parquet_list_single.txt b/tests/data/indexed_files/parquet_list_single.txt new file mode 100644 index 00000000..77f8c852 --- /dev/null +++ b/tests/data/indexed_files/parquet_list_single.txt @@ -0,0 +1,5 @@ +tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_0_0.parquet +tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_1_0.parquet +tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_2_0.parquet +tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_3_0.parquet +tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_4_0.parquet diff --git a/tests/hipscat_import/data/margin_pairs/negative_pairs.csv b/tests/data/margin_pairs/negative_pairs.csv similarity index 100% rename from tests/hipscat_import/data/margin_pairs/negative_pairs.csv rename to tests/data/margin_pairs/negative_pairs.csv diff --git a/tests/hipscat_import/data/margin_pairs/small_sky_source_pairs.csv b/tests/data/margin_pairs/small_sky_source_pairs.csv similarity index 100% rename from tests/hipscat_import/data/margin_pairs/small_sky_source_pairs.csv rename to tests/data/margin_pairs/small_sky_source_pairs.csv diff --git a/tests/hipscat_import/data/mixed_schema/input_01.csv b/tests/data/mixed_schema/input_01.csv similarity index 100% rename from tests/hipscat_import/data/mixed_schema/input_01.csv rename to tests/data/mixed_schema/input_01.csv diff --git a/tests/hipscat_import/data/mixed_schema/input_02.csv b/tests/data/mixed_schema/input_02.csv similarity index 100% rename from tests/hipscat_import/data/mixed_schema/input_02.csv rename to tests/data/mixed_schema/input_02.csv diff --git a/tests/hipscat_import/data/mixed_schema/schema.parquet b/tests/data/mixed_schema/schema.parquet similarity index 100% rename from tests/hipscat_import/data/mixed_schema/schema.parquet rename to tests/data/mixed_schema/schema.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_0_0.parquet b/tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_0_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_0_0.parquet rename to 
tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_0_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_1_0.parquet b/tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_1_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_1_0.parquet rename to tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_1_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_2_0.parquet b/tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_2_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_2_0.parquet rename to tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_2_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_3_0.parquet b/tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_3_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_3_0.parquet rename to tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_3_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_4_0.parquet b/tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_4_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_4_0.parquet rename to tests/data/parquet_shards/order_0/dir_0/pixel_11/shard_4_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_44/shard_0_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_44/shard_0_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_44/shard_0_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_44/shard_0_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_44/shard_1_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_44/shard_1_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_44/shard_1_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_44/shard_1_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_44/shard_2_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_44/shard_2_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_44/shard_2_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_44/shard_2_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_44/shard_3_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_44/shard_3_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_44/shard_3_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_44/shard_3_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_44/shard_4_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_44/shard_4_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_44/shard_4_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_44/shard_4_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_45/shard_0_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_45/shard_0_0.parquet similarity index 100% rename from 
tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_45/shard_0_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_45/shard_0_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_45/shard_1_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_45/shard_1_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_45/shard_1_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_45/shard_1_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_45/shard_2_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_45/shard_2_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_45/shard_2_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_45/shard_2_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_45/shard_3_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_45/shard_3_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_45/shard_3_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_45/shard_3_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_45/shard_4_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_45/shard_4_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_45/shard_4_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_45/shard_4_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_46/shard_0_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_46/shard_0_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_46/shard_0_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_46/shard_0_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_46/shard_1_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_46/shard_1_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_46/shard_1_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_46/shard_1_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_46/shard_2_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_46/shard_2_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_46/shard_2_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_46/shard_2_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_46/shard_3_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_46/shard_3_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_46/shard_3_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_46/shard_3_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_46/shard_4_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_46/shard_4_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_46/shard_4_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_46/shard_4_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_47/shard_0_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_47/shard_0_0.parquet similarity index 
100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_47/shard_0_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_47/shard_0_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_47/shard_1_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_47/shard_1_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_47/shard_1_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_47/shard_1_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_47/shard_2_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_47/shard_2_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_47/shard_2_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_47/shard_2_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_47/shard_3_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_47/shard_3_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_47/shard_3_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_47/shard_3_0.parquet diff --git a/tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_47/shard_4_0.parquet b/tests/data/parquet_shards/order_1/dir_0/pixel_47/shard_4_0.parquet similarity index 100% rename from tests/hipscat_import/data/parquet_shards/order_1/dir_0/pixel_47/shard_4_0.parquet rename to tests/data/parquet_shards/order_1/dir_0/pixel_47/shard_4_0.parquet diff --git a/tests/hipscat_import/data/resume/Norder=0/Dir=0/Npix=11.parquet b/tests/data/resume/Norder=0/Dir=0/Npix=11.parquet similarity index 100% rename from tests/hipscat_import/data/resume/Norder=0/Dir=0/Npix=11.parquet rename to tests/data/resume/Norder=0/Dir=0/Npix=11.parquet diff --git a/tests/hipscat_import/data/resume/Norder=1/Dir=0/Npix=44.parquet b/tests/data/resume/Norder=1/Dir=0/Npix=44.parquet similarity index 100% rename from tests/hipscat_import/data/resume/Norder=1/Dir=0/Npix=44.parquet rename to tests/data/resume/Norder=1/Dir=0/Npix=44.parquet diff --git a/tests/hipscat_import/data/resume/Norder=1/Dir=0/Npix=45.parquet b/tests/data/resume/Norder=1/Dir=0/Npix=45.parquet similarity index 100% rename from tests/hipscat_import/data/resume/Norder=1/Dir=0/Npix=45.parquet rename to tests/data/resume/Norder=1/Dir=0/Npix=45.parquet diff --git a/tests/hipscat_import/data/resume/Norder=1/Dir=0/Npix=46.parquet b/tests/data/resume/Norder=1/Dir=0/Npix=46.parquet similarity index 100% rename from tests/hipscat_import/data/resume/Norder=1/Dir=0/Npix=46.parquet rename to tests/data/resume/Norder=1/Dir=0/Npix=46.parquet diff --git a/tests/hipscat_import/data/resume/Norder=1/Dir=0/Npix=47.parquet b/tests/data/resume/Norder=1/Dir=0/Npix=47.parquet similarity index 100% rename from tests/hipscat_import/data/resume/Norder=1/Dir=0/Npix=47.parquet rename to tests/data/resume/Norder=1/Dir=0/Npix=47.parquet diff --git a/tests/hipscat_import/data/resume/intermediate/mapping_histogram.binary b/tests/data/resume/intermediate/mapping_histogram.binary similarity index 100% rename from tests/hipscat_import/data/resume/intermediate/mapping_histogram.binary rename to tests/data/resume/intermediate/mapping_histogram.binary diff --git a/tests/hipscat_import/data/small_sky/catalog.csv b/tests/data/small_sky/catalog.csv similarity index 100% rename from tests/hipscat_import/data/small_sky/catalog.csv rename to 
tests/data/small_sky/catalog.csv diff --git a/tests/data/small_sky_object_catalog/dataset/Norder=0/Dir=0/Npix=11.parquet b/tests/data/small_sky_object_catalog/dataset/Norder=0/Dir=0/Npix=11.parquet new file mode 100644 index 00000000..af44a761 Binary files /dev/null and b/tests/data/small_sky_object_catalog/dataset/Norder=0/Dir=0/Npix=11.parquet differ diff --git a/tests/data/small_sky_object_catalog/dataset/_common_metadata b/tests/data/small_sky_object_catalog/dataset/_common_metadata new file mode 100644 index 00000000..b473e52c Binary files /dev/null and b/tests/data/small_sky_object_catalog/dataset/_common_metadata differ diff --git a/tests/data/small_sky_object_catalog/dataset/_metadata b/tests/data/small_sky_object_catalog/dataset/_metadata new file mode 100644 index 00000000..b216cda6 Binary files /dev/null and b/tests/data/small_sky_object_catalog/dataset/_metadata differ diff --git a/tests/data/small_sky_object_catalog/partition_info.csv b/tests/data/small_sky_object_catalog/partition_info.csv new file mode 100644 index 00000000..bf77935e --- /dev/null +++ b/tests/data/small_sky_object_catalog/partition_info.csv @@ -0,0 +1,2 @@ +Norder,Npix +0,11 diff --git a/tests/data/small_sky_object_catalog/point_map.fits b/tests/data/small_sky_object_catalog/point_map.fits new file mode 100644 index 00000000..1b6b6291 Binary files /dev/null and b/tests/data/small_sky_object_catalog/point_map.fits differ diff --git a/tests/data/small_sky_object_catalog/properties b/tests/data/small_sky_object_catalog/properties new file mode 100644 index 00000000..0a7630fc --- /dev/null +++ b/tests/data/small_sky_object_catalog/properties @@ -0,0 +1,14 @@ +#HATS catalog +obs_collection=small_sky_object_catalog +dataproduct_type=object +hats_nrows=131 +hats_col_ra=ra +hats_col_dec=dec +hats_max_rows=1000000 +hats_order=0 +moc_sky_fraction=0.08333 +hats_builder=hats-import v0.3.6.dev26+g40366b4 +hats_creation_date=2024-10-11T15\:02UTC +hats_estsize=74 +hats_release_date=2024-09-18 +hats_version=v0.1 diff --git a/tests/hipscat_import/data/small_sky_parts/catalog_00_of_05.csv b/tests/data/small_sky_parts/catalog_00_of_05.csv similarity index 100% rename from tests/hipscat_import/data/small_sky_parts/catalog_00_of_05.csv rename to tests/data/small_sky_parts/catalog_00_of_05.csv diff --git a/tests/hipscat_import/data/small_sky_parts/catalog_01_of_05.csv b/tests/data/small_sky_parts/catalog_01_of_05.csv similarity index 100% rename from tests/hipscat_import/data/small_sky_parts/catalog_01_of_05.csv rename to tests/data/small_sky_parts/catalog_01_of_05.csv diff --git a/tests/hipscat_import/data/small_sky_parts/catalog_02_of_05.csv b/tests/data/small_sky_parts/catalog_02_of_05.csv similarity index 100% rename from tests/hipscat_import/data/small_sky_parts/catalog_02_of_05.csv rename to tests/data/small_sky_parts/catalog_02_of_05.csv diff --git a/tests/hipscat_import/data/small_sky_parts/catalog_03_of_05.csv b/tests/data/small_sky_parts/catalog_03_of_05.csv similarity index 100% rename from tests/hipscat_import/data/small_sky_parts/catalog_03_of_05.csv rename to tests/data/small_sky_parts/catalog_03_of_05.csv diff --git a/tests/hipscat_import/data/small_sky_parts/catalog_04_of_05.csv b/tests/data/small_sky_parts/catalog_04_of_05.csv similarity index 100% rename from tests/hipscat_import/data/small_sky_parts/catalog_04_of_05.csv rename to tests/data/small_sky_parts/catalog_04_of_05.csv diff --git a/tests/hipscat_import/data/small_sky_parts/catalog_10_of_05.csv b/tests/data/small_sky_parts/catalog_10_of_05.csv similarity 
index 100% rename from tests/hipscat_import/data/small_sky_parts/catalog_10_of_05.csv rename to tests/data/small_sky_parts/catalog_10_of_05.csv diff --git a/tests/hipscat_import/data/small_sky_source/small_sky_source.csv b/tests/data/small_sky_source/small_sky_source.csv similarity index 100% rename from tests/hipscat_import/data/small_sky_source/small_sky_source.csv rename to tests/data/small_sky_source/small_sky_source.csv diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=0/Dir=0/Npix=4.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=0/Dir=0/Npix=4.parquet new file mode 100644 index 00000000..e52f7e2e Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=0/Dir=0/Npix=4.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=1/Dir=0/Npix=47.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=1/Dir=0/Npix=47.parquet new file mode 100644 index 00000000..ce27aded Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=1/Dir=0/Npix=47.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=176.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=176.parquet new file mode 100644 index 00000000..b2503107 Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=176.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=177.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=177.parquet new file mode 100644 index 00000000..c49db625 Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=177.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=178.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=178.parquet new file mode 100644 index 00000000..fc6d65db Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=178.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=179.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=179.parquet new file mode 100644 index 00000000..a8a6fea2 Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=179.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=180.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=180.parquet new file mode 100644 index 00000000..fb92ec4c Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=180.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=181.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=181.parquet new file mode 100644 index 00000000..a124074a Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=181.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=182.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=182.parquet new file mode 100644 index 00000000..7693c764 Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=182.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=183.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=183.parquet new file mode 100644 index 
00000000..d53d7d38 Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=183.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=184.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=184.parquet new file mode 100644 index 00000000..df02fae2 Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=184.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=185.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=185.parquet new file mode 100644 index 00000000..d112aca5 Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=185.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=186.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=186.parquet new file mode 100644 index 00000000..3964dc93 Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=186.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=187.parquet b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=187.parquet new file mode 100644 index 00000000..d1ef8c78 Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/Norder=2/Dir=0/Npix=187.parquet differ diff --git a/tests/data/small_sky_source_catalog/dataset/_common_metadata b/tests/data/small_sky_source_catalog/dataset/_common_metadata new file mode 100644 index 00000000..2aa9b2e5 Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/_common_metadata differ diff --git a/tests/data/small_sky_source_catalog/dataset/_metadata b/tests/data/small_sky_source_catalog/dataset/_metadata new file mode 100644 index 00000000..c184e061 Binary files /dev/null and b/tests/data/small_sky_source_catalog/dataset/_metadata differ diff --git a/tests/data/small_sky_source_catalog/partition_info.csv b/tests/data/small_sky_source_catalog/partition_info.csv new file mode 100644 index 00000000..02b94f10 --- /dev/null +++ b/tests/data/small_sky_source_catalog/partition_info.csv @@ -0,0 +1,15 @@ +Norder,Npix +0,4 +1,47 +2,176 +2,177 +2,178 +2,179 +2,180 +2,181 +2,182 +2,183 +2,184 +2,185 +2,186 +2,187 diff --git a/tests/data/small_sky_source_catalog/point_map.fits b/tests/data/small_sky_source_catalog/point_map.fits new file mode 100644 index 00000000..4337a2c4 Binary files /dev/null and b/tests/data/small_sky_source_catalog/point_map.fits differ diff --git a/tests/data/small_sky_source_catalog/properties b/tests/data/small_sky_source_catalog/properties new file mode 100644 index 00000000..8f1b7900 --- /dev/null +++ b/tests/data/small_sky_source_catalog/properties @@ -0,0 +1,14 @@ +#HATS catalog +obs_collection=small_sky_source_catalog +dataproduct_type=source +hats_nrows=17161 +hats_col_ra=source_ra +hats_col_dec=source_dec +hats_max_rows=3000 +hats_order=2 +moc_sky_fraction=0.16667 +hats_builder=hats-import v0.3.6.dev26+g40366b4 +hats_creation_date=2024-10-11T15\:02UTC +hats_estsize=1105 +hats_release_date=2024-09-18 +hats_version=v0.1 diff --git a/tests/hipscat_import/data/soap_intermediate/0_4.csv b/tests/data/soap_intermediate/0_4.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/0_4.csv rename to tests/data/soap_intermediate/0_4.csv diff --git a/tests/hipscat_import/data/soap_intermediate/1_47.csv b/tests/data/soap_intermediate/1_47.csv similarity index 
100% rename from tests/hipscat_import/data/soap_intermediate/1_47.csv rename to tests/data/soap_intermediate/1_47.csv diff --git a/tests/hipscat_import/data/soap_intermediate/2_176.csv b/tests/data/soap_intermediate/2_176.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/2_176.csv rename to tests/data/soap_intermediate/2_176.csv diff --git a/tests/hipscat_import/data/soap_intermediate/2_177.csv b/tests/data/soap_intermediate/2_177.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/2_177.csv rename to tests/data/soap_intermediate/2_177.csv diff --git a/tests/hipscat_import/data/soap_intermediate/2_178.csv b/tests/data/soap_intermediate/2_178.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/2_178.csv rename to tests/data/soap_intermediate/2_178.csv diff --git a/tests/hipscat_import/data/soap_intermediate/2_179.csv b/tests/data/soap_intermediate/2_179.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/2_179.csv rename to tests/data/soap_intermediate/2_179.csv diff --git a/tests/hipscat_import/data/soap_intermediate/2_180.csv b/tests/data/soap_intermediate/2_180.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/2_180.csv rename to tests/data/soap_intermediate/2_180.csv diff --git a/tests/hipscat_import/data/soap_intermediate/2_181.csv b/tests/data/soap_intermediate/2_181.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/2_181.csv rename to tests/data/soap_intermediate/2_181.csv diff --git a/tests/hipscat_import/data/soap_intermediate/2_182.csv b/tests/data/soap_intermediate/2_182.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/2_182.csv rename to tests/data/soap_intermediate/2_182.csv diff --git a/tests/hipscat_import/data/soap_intermediate/2_183.csv b/tests/data/soap_intermediate/2_183.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/2_183.csv rename to tests/data/soap_intermediate/2_183.csv diff --git a/tests/hipscat_import/data/soap_intermediate/2_184.csv b/tests/data/soap_intermediate/2_184.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/2_184.csv rename to tests/data/soap_intermediate/2_184.csv diff --git a/tests/hipscat_import/data/soap_intermediate/2_185.csv b/tests/data/soap_intermediate/2_185.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/2_185.csv rename to tests/data/soap_intermediate/2_185.csv diff --git a/tests/hipscat_import/data/soap_intermediate/2_186.csv b/tests/data/soap_intermediate/2_186.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/2_186.csv rename to tests/data/soap_intermediate/2_186.csv diff --git a/tests/hipscat_import/data/soap_intermediate/2_187.csv b/tests/data/soap_intermediate/2_187.csv similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/2_187.csv rename to tests/data/soap_intermediate/2_187.csv diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_0_4.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_0_4.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_0_4.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_0_4.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_1_47.parquet 
b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_1_47.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_1_47.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_1_47.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_176.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_176.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_176.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_176.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_177.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_177.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_177.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_177.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_178.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_178.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_178.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_178.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_179.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_179.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_179.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_179.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_180.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_180.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_180.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_180.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_181.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_181.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_181.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_181.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_182.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_182.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_182.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_182.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_183.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_183.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_183.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_183.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_184.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_184.parquet similarity index 100% rename from 
tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_184.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_184.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_185.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_185.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_185.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_185.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_186.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_186.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_186.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_186.parquet diff --git a/tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_187.parquet b/tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_187.parquet similarity index 100% rename from tests/hipscat_import/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_187.parquet rename to tests/data/soap_intermediate/order_0/dir_0/pixel_11/source_2_187.parquet diff --git a/tests/hipscat_import/data/test_formats/catalog.csv.gz b/tests/data/test_formats/catalog.csv.gz similarity index 100% rename from tests/hipscat_import/data/test_formats/catalog.csv.gz rename to tests/data/test_formats/catalog.csv.gz diff --git a/tests/hipscat_import/data/test_formats/catalog.starr b/tests/data/test_formats/catalog.starr similarity index 100% rename from tests/hipscat_import/data/test_formats/catalog.starr rename to tests/data/test_formats/catalog.starr diff --git a/tests/hipscat_import/data/test_formats/catalog.zip b/tests/data/test_formats/catalog.zip similarity index 100% rename from tests/hipscat_import/data/test_formats/catalog.zip rename to tests/data/test_formats/catalog.zip diff --git a/tests/hipscat_import/data/test_formats/gaia_epoch.ecsv b/tests/data/test_formats/gaia_epoch.ecsv similarity index 100% rename from tests/hipscat_import/data/test_formats/gaia_epoch.ecsv rename to tests/data/test_formats/gaia_epoch.ecsv diff --git a/tests/hipscat_import/data/test_formats/gaia_minimum.csv b/tests/data/test_formats/gaia_minimum.csv similarity index 100% rename from tests/hipscat_import/data/test_formats/gaia_minimum.csv rename to tests/data/test_formats/gaia_minimum.csv diff --git a/tests/hipscat_import/data/test_formats/gaia_minimum_schema.parquet b/tests/data/test_formats/gaia_minimum_schema.parquet similarity index 100% rename from tests/hipscat_import/data/test_formats/gaia_minimum_schema.parquet rename to tests/data/test_formats/gaia_minimum_schema.parquet diff --git a/tests/hipscat_import/data/test_formats/headers.csv b/tests/data/test_formats/headers.csv similarity index 100% rename from tests/hipscat_import/data/test_formats/headers.csv rename to tests/data/test_formats/headers.csv diff --git a/tests/data/test_formats/healpix_29_index.parquet b/tests/data/test_formats/healpix_29_index.parquet new file mode 100644 index 00000000..5c07ce3b Binary files /dev/null and b/tests/data/test_formats/healpix_29_index.parquet differ diff --git a/tests/hipscat_import/data/test_formats/macauff_metadata.yaml b/tests/data/test_formats/macauff_metadata.yaml similarity index 100% rename from tests/hipscat_import/data/test_formats/macauff_metadata.yaml rename to 
tests/data/test_formats/macauff_metadata.yaml diff --git a/tests/hipscat_import/data/test_formats/pandasindex.parquet b/tests/data/test_formats/pandasindex.parquet similarity index 100% rename from tests/hipscat_import/data/test_formats/pandasindex.parquet rename to tests/data/test_formats/pandasindex.parquet diff --git a/tests/hipscat_import/data/test_formats/pipe_delimited.csv b/tests/data/test_formats/pipe_delimited.csv similarity index 100% rename from tests/hipscat_import/data/test_formats/pipe_delimited.csv rename to tests/data/test_formats/pipe_delimited.csv diff --git a/tests/hipscat_import/data/test_formats/small_sky.fits b/tests/data/test_formats/small_sky.fits similarity index 100% rename from tests/hipscat_import/data/test_formats/small_sky.fits rename to tests/data/test_formats/small_sky.fits diff --git a/tests/data/test_formats/spatial_index.csv b/tests/data/test_formats/spatial_index.csv new file mode 100644 index 00000000..e58df533 --- /dev/null +++ b/tests/data/test_formats/spatial_index.csv @@ -0,0 +1,132 @@ +id,_healpix_29,magnitude,nobs +707,3187422220181831680,22.13496609,264 +792,3187796123454537728,6.487240283,395 +811,3188300701661921280,23.7801059,268 +723,3188300701661921280,22.86223173,426 +826,3192670279995228160,18.01813779,338 +750,3192995164287139840,24.78617356,364 +771,3194102393992118272,23.11024818,389 +734,3195678697494413312,21.40031147,479 +738,3196676706683322368,13.78825467,457 +772,3196723640945082368,4.188779415,336 +776,3197084959829393408,12.40395764,212 +733,3199487976389214208,8.970635074,217 +804,3200256676289576960,2.651958506,451 +747,3204516948860731392,8.884322517,288 +739,3205876081882038272,4.29767576,332 +816,3210595332877451264,21.90158694,413 +703,3210618432891387904,24.08464986,247 +794,3213763510983983104,12.84586391,410 +735,3214195389014999040,11.25435057,386 +797,3214969534743052288,3.39664171,308 +815,3216746212972560384,8.873824597,347 +748,3220523316512030720,0.7007125911,221 +716,3221572881415667712,10.69933855,239 +807,3221644366134116352,21.07926395,466 +768,3221942678435397632,7.656387426,472 +729,3222029369621872640,20.04342103,474 +810,3222093801031991296,10.51822722,445 +718,3222606439509917696,19.06826935,322 +818,3224426300283289600,13.12773551,380 +766,3225326185518989312,9.016078942,281 +730,3226002888984625152,7.923479941,463 +758,3231100210200444928,19.96407461,202 +780,3231184305676877824,12.77423302,250 +775,3231700866781151232,4.732638481,432 +760,3231878325195505664,13.65187931,331 +795,3233808982978068480,25.40947285,364 +822,3236559609654149120,12.71703295,471 +736,3236880878436089856,11.43843929,444 +801,3237494352309649408,20.47232202,315 +830,3237753854591238144,15.73427632,417 +817,3239484224248479744,12.05402411,346 +787,3239635329516306432,0.173058202,318 +812,3245124716102418432,0.06359345768,208 +722,3246262717484367872,25.60136077,381 +731,3256317681612095488,6.948013224,310 +720,3257765200566157312,12.14407631,360 +823,3260117115292614656,13.46433211,487 +742,3263971244188368896,15.54236222,480 +719,3273290000274292736,12.6330677,435 +710,3273594569313222656,18.02127743,316 +726,3273829406168055808,5.502478521,462 +744,3274444766326030336,0.8044727764,209 +813,3275039327016452096,17.49730373,271 +757,3277296053784674304,6.925286532,483 +821,3278748473083559936,6.87906273,454 +762,3279291389443047424,13.02548232,345 +728,3280519485070508032,12.44919001,380 +781,3280802192601120768,12.0115697,417 +704,3282636638231986176,24.91128218,430 +751,3283894517638365184,24.91242751,372 
+724,3289601756302934016,16.88580167,421 +808,3291070806175514624,24.24020062,490 +784,3296723682484813824,13.33239374,225 +732,3296863419443970048,13.31989657,258 +745,3297480448190447616,25.75526883,405 +786,3300600436121141248,14.17373488,371 +705,3300775760909828096,21.0010083,269 +779,3302771647140855808,1.907256896,492 +761,3308757303243571200,4.441992632,207 +828,3309847189889482752,1.459992705,354 +803,3312697108462567424,20.88998098,233 +788,3315911807728615424,16.89976725,334 +700,3318157971330957312,17.50204436,497 +793,3319374857273081856,11.03349796,373 +749,3320852365814267904,16.3051455,268 +805,3321246044863332352,18.38673631,351 +773,3323329198194425856,7.637464207,252 +774,3325242552886493184,16.00879042,469 +712,3329217475893002240,14.92672305,453 +759,3329913878876258304,9.060606183,393 +820,3331427345701535744,8.722036752,393 +789,3331529653644951552,17.74075663,292 +711,3333910191588507648,18.97132992,356 +802,3333964020129464320,22.33402335,295 +701,3335348517171363840,1.17742214,263 +727,3336827918335606784,19.59159969,353 +717,3337000956645605376,13.54006347,202 +753,3337786698351247360,8.367684293,285 +769,3339749652318650368,23.5473658,325 +725,3339833371228372992,8.571007577,432 +827,3340634127750660096,7.048240895,284 +777,3341153232084795392,15.69992175,478 +764,3342097127818919936,2.729025506,371 +785,3342370595083911168,24.20554709,285 +709,3342378539155456000,19.52493845,473 +713,3343552655515451392,25.88972528,336 +800,3345607350541090816,22.99430092,428 +706,3346150369862352896,18.74524258,287 +755,3346840175423520768,20.52555018,377 +741,3346840175423520768,4.868075987,278 +714,3347083653898305536,12.67035091,334 +763,3347303042623995904,5.129770839,441 +708,3347377290775363584,6.418179256,477 +765,3348147488165986304,1.538183169,482 +740,3348397106555518976,3.147345677,340 +783,3356290493674684416,11.01365058,230 +790,3365700046305624064,0.8847339055,456 +809,3366308343492640768,25.57341722,253 +715,3366847976645328896,17.7427978,477 +782,3369301736590147584,14.58925323,231 +752,3372246530833514496,10.62019163,320 +746,3380119216995696640,24.33242981,297 +770,3380458994856361984,25.87566821,339 +756,3388235695416934400,16.94841816,342 +798,3388424365484802048,12.41856805,315 +778,3389280889354584064,19.34416403,297 +829,3389344265064677376,24.98975232,446 +819,3389454143235031040,18.2879184,367 +814,3390042224873963520,18.10171084,444 +721,3390233494164602880,5.911152016,423 +737,3390395511632560128,8.714697049,310 +799,3390927915493359616,3.390950561,267 +825,3391172539242774528,4.077103157,389 +796,3391463069395714048,21.03770104,418 +754,3397177333028749312,20.86431444,408 +806,3397704562974392320,13.50630832,308 +791,3397804200316370944,16.70345827,263 +824,3399000453069930496,11.11028334,243 +702,3399532867185803264,17.0456881,429 +767,3400255793564483584,12.99882524,476 +743,3424180623568273408,19.0979479,428 diff --git a/tests/hipscat_import/catalog/test_argument_validation.py b/tests/hats_import/catalog/test_argument_validation.py similarity index 79% rename from tests/hipscat_import/catalog/test_argument_validation.py rename to tests/hats_import/catalog/test_argument_validation.py index 2f98f431..b6c827f9 100644 --- a/tests/hipscat_import/catalog/test_argument_validation.py +++ b/tests/hats_import/catalog/test_argument_validation.py @@ -1,10 +1,8 @@ """Tests of argument validation""" import pytest -from hipscat.io import write_metadata -from hipscat_import.catalog.arguments import ImportArguments, check_healpix_order_range -from 
hipscat_import.catalog.file_readers import CsvReader +from hats_import.catalog.arguments import ImportArguments, check_healpix_order_range # pylint: disable=protected-access @@ -185,14 +183,14 @@ def test_catalog_type(blank_data_dir, tmp_path): ) -def test_use_hipscat_index(blank_data_dir, tmp_path): +def test_use_healpix_29(blank_data_dir, tmp_path): with pytest.raises(ValueError, match="no sort columns should be added"): ImportArguments( output_artifact_name="catalog", input_path=blank_data_dir, file_reader="csv", output_path=tmp_path, - use_hipscat_index=True, + use_healpix_29=True, sort_columns="foo", ) ImportArguments( @@ -200,12 +198,12 @@ def test_use_hipscat_index(blank_data_dir, tmp_path): input_path=blank_data_dir, file_reader="csv", output_path=tmp_path, - use_hipscat_index=True, + use_healpix_29=True, sort_columns="", # empty string is ok ) -def test_to_catalog_info(blank_data_dir, tmp_path): +def test_to_table_properties(blank_data_dir, tmp_path): """Verify creation of catalog parameters for catalog to be created.""" args = ImportArguments( output_artifact_name="catalog", @@ -214,56 +212,13 @@ def test_to_catalog_info(blank_data_dir, tmp_path): output_path=tmp_path, tmp_dir=tmp_path, progress_bar=False, + addl_hats_properties={"hats_cols_default": "id, mjd", "obs_regime": "Optical"}, ) - catalog_info = args.to_catalog_info(total_rows=10) + catalog_info = args.to_table_properties(total_rows=10, highest_order=4, moc_sky_fraction=22 / 7) assert catalog_info.catalog_name == "catalog" assert catalog_info.total_rows == 10 - - -def test_provenance_info(blank_data_dir, tmp_path): - """Verify that provenance info includes catalog-specific fields.""" - args = ImportArguments( - output_artifact_name="catalog", - input_path=blank_data_dir, - file_reader="csv", - output_path=tmp_path, - tmp_dir=tmp_path, - progress_bar=False, - ) - - runtime_args = args.provenance_info()["runtime_args"] - assert "epoch" in runtime_args - - -def test_write_provenance_info(formats_dir, tmp_path): - """Verify that provenance info can be written to JSON file.""" - input_file = formats_dir / "gaia_minimum.csv" - schema_file = formats_dir / "gaia_minimum_schema.parquet" - - args = ImportArguments( - output_artifact_name="gaia_minimum", - input_file_list=[input_file], - file_reader=CsvReader( - comment="#", - header=None, - schema_file=schema_file, - ), - ra_column="ra", - dec_column="dec", - sort_columns="solution_id", - use_schema_file=schema_file, - output_path=tmp_path, - dask_tmp=tmp_path, - highest_healpix_order=2, - pixel_threshold=3_000, - progress_bar=False, - ) - - write_metadata.write_provenance_info( - catalog_base_dir=args.catalog_path, - dataset_info=args.to_catalog_info(0), - tool_args=args.provenance_info(), - ) + assert catalog_info.default_columns == ["id", "mjd"] + assert catalog_info.__pydantic_extra__["obs_regime"] == "Optical" def test_check_healpix_order_range(): @@ -274,8 +229,8 @@ def test_check_healpix_order_range(): with pytest.raises(ValueError, match="positive"): check_healpix_order_range(5, "order_field", lower_bound=-1) - with pytest.raises(ValueError, match="19"): - check_healpix_order_range(5, "order_field", upper_bound=20) + with pytest.raises(ValueError, match="29"): + check_healpix_order_range(5, "order_field", upper_bound=30) with pytest.raises(ValueError, match="order_field"): check_healpix_order_range(-1, "order_field") diff --git a/tests/hipscat_import/catalog/test_file_readers.py b/tests/hats_import/catalog/test_file_readers.py similarity index 84% rename from 
tests/hipscat_import/catalog/test_file_readers.py rename to tests/hats_import/catalog/test_file_readers.py index 3bab85df..de95e3c8 100644 --- a/tests/hipscat_import/catalog/test_file_readers.py +++ b/tests/hats_import/catalog/test_file_readers.py @@ -1,14 +1,13 @@ """Test dataframe-generating file readers""" -import hipscat.io.write_metadata as io import numpy as np import pandas as pd import pyarrow as pa import pyarrow.parquet as pq import pytest -from hipscat.catalog.catalog import CatalogInfo +from hats.catalog import TableProperties -from hipscat_import.catalog.file_readers import ( +from hats_import.catalog.file_readers import ( CsvReader, FitsReader, IndexedCsvReader, @@ -28,7 +27,7 @@ def basic_catalog_info(): "ra_column": "ra", "dec_column": "dec", } - return CatalogInfo(**info) + return TableProperties(**info) def test_unknown_file_type(): @@ -230,33 +229,6 @@ def test_csv_reader_pipe_delimited(formats_pipe_csv, tmp_path): assert np.all(column_types == expected_column_types) -def test_csv_reader_provenance_info(tmp_path, basic_catalog_info): - """Test that we get some provenance info and it is parseable into JSON.""" - reader = CsvReader( - header=None, - sep="|", - column_names=["letters", "ints", "empty", "numeric"], - type_map={ - "letters": object, - "ints": int, - "empty": "Int64", - "numeric": int, - }, - storage_options={"user_name": "user_pii", "user_key": "SECRETS!"}, - ) - provenance_info = reader.provenance_info() - catalog_base_dir = tmp_path / "test_catalog" - catalog_base_dir.mkdir(parents=True) - io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info) - - with open(catalog_base_dir / "provenance_info.json", "r", encoding="utf-8") as file: - data = file.read() - assert "test_catalog" in data - assert "REDACTED" in data - assert "user_pii" not in data - assert "SECRETS" not in data - - def test_indexed_csv_reader(indexed_files_dir): # Chunksize covers all the inputs. 
total_chunks = 0 @@ -330,15 +302,6 @@ def test_indexed_parquet_reader(indexed_files_dir): assert total_chunks == 29 -def test_parquet_reader_provenance_info(tmp_path, basic_catalog_info): - """Test that we get some provenance info and it is parseable into JSON.""" - reader = ParquetReader(chunksize=1) - provenance_info = reader.provenance_info() - catalog_base_dir = tmp_path / "test_catalog" - catalog_base_dir.mkdir(parents=True) - io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info) - - def test_parquet_reader_columns(parquet_shards_shard_44_0): """Verify we can read a subset of columns.""" column_subset = ["id", "dec"] @@ -387,12 +350,3 @@ def test_read_fits_columns(formats_fits): FitsReader(skip_column_names=["ra_error", "dec_error"]).read(formats_fits, read_columns=["ra", "dec"]) ) assert list(frame.columns) == ["ra", "dec"] - - -def test_fits_reader_provenance_info(tmp_path, basic_catalog_info): - """Test that we get some provenance info and it is parseable into JSON.""" - reader = FitsReader() - provenance_info = reader.provenance_info() - catalog_base_dir = tmp_path / "test_catalog" - catalog_base_dir.mkdir(parents=True) - io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info) diff --git a/tests/hipscat_import/catalog/test_map_reduce.py b/tests/hats_import/catalog/test_map_reduce.py similarity index 93% rename from tests/hipscat_import/catalog/test_map_reduce.py rename to tests/hats_import/catalog/test_map_reduce.py index 0cc67580..2e509dfb 100644 --- a/tests/hipscat_import/catalog/test_map_reduce.py +++ b/tests/hats_import/catalog/test_map_reduce.py @@ -4,18 +4,18 @@ import pickle from io import StringIO -import hipscat.pixel_math as hist -import hipscat.pixel_math.healpix_shim as hp +import hats.pixel_math as hist +import hats.pixel_math.healpix_shim as hp import numpy as np import numpy.testing as npt import pandas as pd import pyarrow as pa import pytest -import hipscat_import.catalog.map_reduce as mr -from hipscat_import.catalog.file_readers import get_file_reader -from hipscat_import.catalog.resume_plan import ResumePlan -from hipscat_import.catalog.sparse_histogram import SparseHistogram +import hats_import.catalog.map_reduce as mr +from hats_import.catalog.file_readers import get_file_reader +from hats_import.catalog.resume_plan import ResumePlan +from hats_import.catalog.sparse_histogram import SparseHistogram def pickle_file_reader(tmp_path, file_reader) -> str: @@ -148,16 +148,16 @@ def test_map_headers(tmp_path, formats_headers_csv): assert (result == expected).all() -def test_map_with_hipscat_index(tmp_path, formats_dir, small_sky_single_file): +def test_map_with_healpix_29(tmp_path, formats_dir, small_sky_single_file): (tmp_path / "histograms").mkdir(parents=True) - input_file = formats_dir / "hipscat_index.csv" + input_file = formats_dir / "spatial_index.csv" mr.map_to_pixels( input_file=input_file, pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("csv")), highest_order=0, ra_column="NOPE", dec_column="NOPE", - use_hipscat_index=True, # radec don't matter. just use existing index + use_healpix_29=True, # radec don't matter. just use existing index resume_path=tmp_path, mapping_key="map_0", ) @@ -175,7 +175,7 @@ def test_map_with_hipscat_index(tmp_path, formats_dir, small_sky_single_file): highest_order=0, ra_column="NOPE", dec_column="NOPE", - use_hipscat_index=True, # no pre-existing index! expect failure. + use_healpix_29=True, # no pre-existing index! expect failure. 
resume_path=tmp_path, mapping_key="map_0", ) @@ -318,21 +318,21 @@ def test_reduce_order0(parquet_shards_dir, assert_parquet_file_ids, tmp_path): destination_pixel_number=11, destination_pixel_size=131, output_path=tmp_path, - add_hipscat_index=True, + add_healpix_29=True, ra_column="ra", dec_column="dec", sort_columns="id", delete_input_files=False, ) - output_file = tmp_path / "Norder=0" / "Dir=0" / "Npix=11.parquet" + output_file = tmp_path / "dataset" / "Norder=0" / "Dir=0" / "Npix=11.parquet" expected_ids = [*range(700, 831)] assert_parquet_file_ids(output_file, "id", expected_ids) -def test_reduce_hipscat_index(parquet_shards_dir, assert_parquet_file_ids, tmp_path): - """Test reducing with or without a _hipscat_index field""" +def test_reduce_healpix_29(parquet_shards_dir, assert_parquet_file_ids, tmp_path): + """Test reducing with or without a _healpix_29 field""" (tmp_path / "reducing").mkdir(parents=True) mr.reduce_pixel_shards( cache_shard_path=parquet_shards_dir, @@ -348,12 +348,12 @@ def test_reduce_hipscat_index(parquet_shards_dir, assert_parquet_file_ids, tmp_p delete_input_files=False, ) - output_file = tmp_path / "Norder=0" / "Dir=0" / "Npix=11.parquet" + output_file = tmp_path / "dataset" / "Norder=0" / "Dir=0" / "Npix=11.parquet" expected_ids = [*range(700, 831)] assert_parquet_file_ids(output_file, "id", expected_ids) data_frame = pd.read_parquet(output_file, engine="pyarrow") - assert data_frame.index.name == "_hipscat_index" + assert data_frame.index.name == "_healpix_29" npt.assert_array_equal( data_frame.columns, ["id", "ra", "dec", "ra_error", "dec_error", "Norder", "Dir", "Npix"], @@ -367,7 +367,7 @@ def test_reduce_hipscat_index(parquet_shards_dir, assert_parquet_file_ids, tmp_p destination_pixel_number=11, destination_pixel_size=131, output_path=tmp_path, - add_hipscat_index=False, ## different from above + add_healpix_29=False, ## different from above ra_column="ra", dec_column="dec", sort_columns="id", @@ -408,7 +408,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path): Logically, the input data has a mix of orderings in files, object IDs, and timestamps. Each source is partitioned according to the linked object's radec, and so will be - ordered within the same hipscat_index value. + ordered within the same spatial_index value. First, we take some time to set up these silly data points, then we test out reducing them into a single parquet file using a mix of reduction options. @@ -416,7 +416,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path): (tmp_path / "reducing").mkdir(parents=True) shard_dir = tmp_path / "reduce_shards" / "order_0" / "dir_0" / "pixel_11" shard_dir.mkdir(parents=True) - output_file = tmp_path / "Norder=0" / "Dir=0" / "Npix=11.parquet" + output_file = tmp_path / "dataset" / "Norder=0" / "Dir=0" / "Npix=11.parquet" file1_string = """source_id,object_id,time,ra,dec 1200,700,3000,282.5,-58.5 @@ -444,7 +444,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path): lonlat=True, nest=True, ) - ## Use this to prune generated columns like Norder, Npix, and _hipscat_index + ## Use this to prune generated columns like Norder, Npix, and _healpix_29 comparison_columns = ["source_id", "object_id", "time", "ra", "dec"] ######################## Sort option 1: by source_id @@ -520,7 +520,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path): resort_ids=False, ) - ######################## Sort option 3: by object id and time WITHOUT hipscat index. 
+ ######################## Sort option 3: by object id and time WITHOUT spatial index. ## The 1500 block of ids goes back to the end, because we're not using ## spatial properties for sorting, only numeric. ## sort order is effectively (object id, time) @@ -535,7 +535,7 @@ def test_reduce_with_sorting_complex(assert_parquet_file_ids, tmp_path): ra_column="ra", dec_column="dec", sort_columns="object_id,time", - add_hipscat_index=False, + add_healpix_29=False, delete_input_files=False, ) diff --git a/tests/hipscat_import/catalog/test_resume_plan.py b/tests/hats_import/catalog/test_resume_plan.py similarity index 98% rename from tests/hipscat_import/catalog/test_resume_plan.py rename to tests/hats_import/catalog/test_resume_plan.py index 705e6079..73fb1f11 100644 --- a/tests/hipscat_import/catalog/test_resume_plan.py +++ b/tests/hats_import/catalog/test_resume_plan.py @@ -4,8 +4,8 @@ import numpy.testing as npt import pytest -from hipscat_import.catalog.resume_plan import ResumePlan -from hipscat_import.catalog.sparse_histogram import SparseHistogram +from hats_import.catalog.resume_plan import ResumePlan +from hats_import.catalog.sparse_histogram import SparseHistogram def test_done_checks(tmp_path): diff --git a/tests/hipscat_import/catalog/test_run_import.py b/tests/hats_import/catalog/test_run_import.py similarity index 89% rename from tests/hipscat_import/catalog/test_run_import.py rename to tests/hats_import/catalog/test_run_import.py index 96612615..afd7f1a1 100644 --- a/tests/hipscat_import/catalog/test_run_import.py +++ b/tests/hats_import/catalog/test_run_import.py @@ -9,13 +9,13 @@ import pyarrow as pa import pyarrow.parquet as pq import pytest -from hipscat.catalog.catalog import Catalog +from hats.catalog.catalog import Catalog -import hipscat_import.catalog.run_import as runner -from hipscat_import.catalog.arguments import ImportArguments -from hipscat_import.catalog.file_readers import CsvReader -from hipscat_import.catalog.resume_plan import ResumePlan -from hipscat_import.catalog.sparse_histogram import SparseHistogram +import hats_import.catalog.run_import as runner +from hats_import.catalog.arguments import ImportArguments +from hats_import.catalog.file_readers import CsvReader +from hats_import.catalog.resume_plan import ResumePlan +from hats_import.catalog.sparse_histogram import SparseHistogram def test_empty_args(): @@ -61,7 +61,7 @@ def test_resume_dask_runner( ResumePlan.touch_key_done_file(resume_tmp, ResumePlan.REDUCING_STAGE, "0_11") - shutil.copytree(resume_dir / "Norder=0", tmp_path / "resume_catalog" / "Norder=0") + shutil.copytree(resume_dir / "Norder=0", tmp_path / "resume_catalog" / "dataset" / "Norder=0") args = ImportArguments( output_artifact_name="resume_catalog", @@ -79,7 +79,7 @@ def test_resume_dask_runner( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert catalog.catalog_info.ra_column == "ra" @@ -88,7 +88,7 @@ def test_resume_dask_runner( assert len(catalog.get_healpix_pixels()) == 1 # Check that the catalog parquet file exists and contains correct object IDs - output_file = Path(args.catalog_path) / "Norder=0" / "Dir=0" / "Npix=11.parquet" + output_file = Path(args.catalog_path) / "dataset" / "Norder=0" / "Dir=0" / "Npix=11.parquet" expected_ids = [*range(700, 831)] assert_parquet_file_ids(output_file, "id", expected_ids) @@ -111,7 +111,7 @@ 
def test_resume_dask_runner( runner.run(args, dask_client) - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert catalog.catalog_info.ra_column == "ra" @@ -177,7 +177,7 @@ def test_resume_dask_runner_diff_pixel_order( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert catalog.catalog_info.ra_column == "ra" @@ -186,9 +186,8 @@ def test_resume_dask_runner_diff_pixel_order( assert len(catalog.get_healpix_pixels()) == 4 for n_pix in range(44, 48): - filename = os.path.join("Norder=1", "Dir=0", f"Npix={n_pix}.parquet") - output_filepath = os.path.join(args.catalog_path, filename) - expected_filepath = os.path.join(resume_dir, filename) + output_filepath = args.catalog_path / "dataset" / "Norder=1" / "Dir=0" / f"Npix={n_pix}.parquet" + expected_filepath = resume_dir / "Norder=1" / "Dir=0" / f"Npix={n_pix}.parquet" expected_file = pd.read_parquet(expected_filepath, engine="pyarrow") assert_parquet_file_ids(output_filepath, "id", expected_file["id"].to_numpy()) @@ -262,7 +261,7 @@ def test_dask_runner( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert catalog.catalog_info.ra_column == "ra" @@ -271,7 +270,7 @@ def test_dask_runner( assert len(catalog.get_healpix_pixels()) == 1 # Check that the catalog parquet file exists and contains correct object IDs - output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + output_file = os.path.join(args.catalog_path, "dataset", "Norder=0", "Dir=0", "Npix=11.parquet") expected_ids = [*range(700, 831)] assert_parquet_file_ids(output_file, "id", expected_ids) @@ -287,12 +286,12 @@ def test_dask_runner( pa.field("Norder", pa.uint8()), pa.field("Dir", pa.uint64()), pa.field("Npix", pa.uint64()), - pa.field("_hipscat_index", pa.uint64()), + pa.field("_healpix_29", pa.int64()), ] ) schema = pq.read_metadata(output_file).schema.to_arrow_schema() assert schema.equals(expected_parquet_schema, check_metadata=False) - schema = pq.read_metadata(os.path.join(args.catalog_path, "_metadata")).schema.to_arrow_schema() + schema = pq.read_metadata(args.catalog_path / "dataset" / "_metadata").schema.to_arrow_schema() assert schema.equals(expected_parquet_schema, check_metadata=False) # Check that, when re-loaded as a pandas dataframe, the appropriate numeric types are used. 
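The path and loader changes running through the runner tests above follow one pattern: leaf parquet files and the parquet metadata now sit under a dataset/ subdirectory, the catalog-level metadata file is named properties instead of catalog_info.json, and catalogs are loaded with read_hats. A minimal sketch of that access pattern, with a placeholder catalog path:

    from pathlib import Path
    from hats.catalog.catalog import Catalog

    catalog_path = Path("/path/to/catalog")                  # placeholder
    catalog = Catalog.read_hats(catalog_path)                # was Catalog.read_from_hipscat
    properties_file = catalog_path / "properties"            # was catalog_info.json
    metadata_file = catalog_path / "dataset" / "_metadata"
    leaf_file = catalog_path / "dataset" / "Norder=0" / "Dir=0" / "Npix=11.parquet"

Note also that the _healpix_29 column written into those files is a signed int64, where the old _hipscat_index was an unsigned uint64.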
@@ -310,7 +309,7 @@ def test_dask_runner( } ) assert data_frame.dtypes.equals(expected_dtypes) - assert data_frame.index.dtype == np.uint64 + assert data_frame.index.dtype == np.int64 @pytest.mark.dask @@ -329,15 +328,15 @@ def test_dask_runner_stats_only(dask_client, small_sky_parts_dir, tmp_path): runner.run(args, dask_client) - metadata_filename = os.path.join(args.catalog_path, "catalog_info.json") + metadata_filename = os.path.join(args.catalog_path, "properties") assert os.path.exists(metadata_filename) # Check that the catalog parquet file DOES NOT exist - output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + output_file = os.path.join(args.catalog_path, "dataset", "Norder=0", "Dir=0", "Npix=11.parquet") assert not os.path.exists(output_file) - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert catalog.catalog_info.ra_column == "ra" diff --git a/tests/hipscat_import/catalog/test_run_round_trip.py b/tests/hats_import/catalog/test_run_round_trip.py similarity index 87% rename from tests/hipscat_import/catalog/test_run_round_trip.py rename to tests/hats_import/catalog/test_run_round_trip.py index b488f320..2276a824 100644 --- a/tests/hipscat_import/catalog/test_run_round_trip.py +++ b/tests/hats_import/catalog/test_run_round_trip.py @@ -15,12 +15,12 @@ import pyarrow.dataset as pds import pyarrow.parquet as pq import pytest -from hipscat.catalog.catalog import Catalog -from hipscat.pixel_math.hipscat_id import hipscat_id_to_healpix +from hats.catalog.catalog import Catalog +from hats.pixel_math.spatial_index import spatial_index_to_healpix -import hipscat_import.catalog.run_import as runner -from hipscat_import.catalog.arguments import ImportArguments -from hipscat_import.catalog.file_readers import CsvReader, get_file_reader +import hats_import.catalog.run_import as runner +from hats_import.catalog.arguments import ImportArguments +from hats_import.catalog.file_readers import CsvReader, get_file_reader @pytest.mark.dask @@ -51,7 +51,7 @@ def test_import_source_table( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert catalog.catalog_info.ra_column == "source_ra" @@ -94,7 +94,7 @@ def test_import_mixed_schema_csv( runner.run(args, dask_client) # Check that the catalog parquet file exists - output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + output_file = os.path.join(args.catalog_path, "dataset", "Norder=0", "Dir=0", "Npix=11.parquet") assert_parquet_file_ids(output_file, "id", [*range(700, 708)]) @@ -111,12 +111,12 @@ def test_import_mixed_schema_csv( pa.field("Norder", pa.uint8()), pa.field("Dir", pa.uint64()), pa.field("Npix", pa.uint64()), - pa.field("_hipscat_index", pa.uint64()), + pa.field("_healpix_29", pa.int64()), ] ) schema = pq.read_metadata(output_file).schema.to_arrow_schema() assert schema.equals(expected_parquet_schema, check_metadata=False) - schema = pq.read_metadata(os.path.join(args.catalog_path, "_metadata")).schema.to_arrow_schema() + schema = pq.read_metadata(args.catalog_path / "dataset" / "_metadata").schema.to_arrow_schema() assert schema.equals(expected_parquet_schema, check_metadata=False) @@ -153,13 +153,13 @@ def test_import_preserve_index( ["obj_id", 
"band", "ra", "dec", "mag"], ) - ## Don't generate a hipscat index. Verify that the original index remains. + ## Don't generate a hats spatial index. Verify that the original index remains. args = ImportArguments( output_artifact_name="pandasindex", input_file_list=[formats_pandasindex], file_reader="parquet", sort_columns="obs_id", - add_hipscat_index=False, + add_healpix_29=False, output_path=tmp_path, dask_tmp=tmp_path, highest_healpix_order=1, @@ -169,7 +169,7 @@ def test_import_preserve_index( runner.run(args, dask_client) # Check that the catalog parquet file exists - output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + output_file = os.path.join(args.catalog_path, "dataset", "Norder=0", "Dir=0", "Npix=11.parquet") assert_parquet_file_index(output_file, expected_indexes) data_frame = pd.read_parquet(output_file, engine="pyarrow") @@ -179,13 +179,13 @@ def test_import_preserve_index( ["obj_id", "band", "ra", "dec", "mag", "Norder", "Dir", "Npix"], ) - ## DO generate a hipscat index. Verify that the original index is preserved in a column. + ## DO generate a hats spatial index. Verify that the original index is preserved in a column. args = ImportArguments( output_artifact_name="pandasindex_preserve", input_file_list=[formats_pandasindex], file_reader="parquet", sort_columns="obs_id", - add_hipscat_index=True, + add_healpix_29=True, output_path=tmp_path, dask_tmp=tmp_path, highest_healpix_order=1, @@ -195,10 +195,10 @@ def test_import_preserve_index( runner.run(args, dask_client) # Check that the catalog parquet file exists - output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + output_file = os.path.join(args.catalog_path, "dataset", "Norder=0", "Dir=0", "Npix=11.parquet") data_frame = pd.read_parquet(output_file, engine="pyarrow") - assert data_frame.index.name == "_hipscat_index" + assert data_frame.index.name == "_healpix_29" npt.assert_array_equal( data_frame.columns, ["obs_id", "obj_id", "band", "ra", "dec", "mag", "Norder", "Dir", "Npix"], @@ -229,14 +229,14 @@ def test_import_constant_healpix_order( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path # Check that the partition info file exists - all pixels at order 2! 
assert all(pixel.order == 2 for pixel in catalog.partition_info.get_healpix_pixels()) # Pick a parquet file and make sure it contains as many rows as we expect - output_file = os.path.join(args.catalog_path, "Norder=2", "Dir=0", "Npix=178.parquet") + output_file = os.path.join(args.catalog_path, "dataset", "Norder=2", "Dir=0", "Npix=178.parquet") data_frame = pd.read_parquet(output_file, engine="pyarrow") assert len(data_frame) == 14 @@ -270,7 +270,7 @@ def test_import_keep_intermediate_files( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path @@ -424,14 +424,14 @@ def test_import_lowest_healpix_order( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path # Check that the partition info file exists - all pixels at order 2! assert all(pixel.order == 2 for pixel in catalog.partition_info.get_healpix_pixels()) # Pick a parquet file and make sure it contains as many rows as we expect - output_file = os.path.join(args.catalog_path, "Norder=2", "Dir=0", "Npix=178.parquet") + output_file = os.path.join(args.catalog_path, "dataset", "Norder=2", "Dir=0", "Npix=178.parquet") data_frame = pd.read_parquet(output_file, engine="pyarrow") assert len(data_frame) == 14 @@ -475,46 +475,46 @@ def test_import_starr_file( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert catalog.catalog_info.total_rows == 131 assert len(catalog.get_healpix_pixels()) == 1 # Check that the catalog parquet file exists and contains correct object IDs - output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + output_file = os.path.join(args.catalog_path, "dataset", "Norder=0", "Dir=0", "Npix=11.parquet") expected_ids = [*range(700, 831)] assert_parquet_file_ids(output_file, "id", expected_ids) @pytest.mark.dask -def test_import_hipscat_index( +def test_import_healpix_29( dask_client, formats_dir, assert_parquet_file_ids, tmp_path, ): - """Test basic execution, using a previously-computed _hipscat_index column for spatial partitioning.""" + """Test basic execution, using a previously-computed _healpix_29 column for spatial partitioning.""" ## First, let's just check the assumptions we have about our input file: - ## - should have _hipscat_index as the indexed column + ## - should have _healpix_29 as the indexed column ## - should NOT have any columns like "ra" or "dec" - input_file = formats_dir / "hipscat_index.parquet" + input_file = formats_dir / "healpix_29_index.parquet" expected_ids = [*range(700, 831)] assert_parquet_file_ids(input_file, "id", expected_ids) data_frame = pd.read_parquet(input_file, engine="pyarrow") - assert data_frame.index.name == "_hipscat_index" + assert data_frame.index.name == "_healpix_29" npt.assert_array_equal(data_frame.columns, ["id"]) args = ImportArguments( - output_artifact_name="using_hipscat_index", + output_artifact_name="using_healpix_29", input_file_list=[input_file], file_reader="parquet", output_path=tmp_path, dask_tmp=tmp_path, - use_hipscat_index=True, + 
use_healpix_29=True, highest_healpix_order=2, pixel_threshold=3_000, progress_bar=False, @@ -523,19 +523,19 @@ def test_import_hipscat_index( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert catalog.catalog_info.total_rows == 131 assert len(catalog.get_healpix_pixels()) == 1 # Check that the catalog parquet file exists and contains correct object IDs - output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + output_file = os.path.join(args.catalog_path, "dataset", "Norder=0", "Dir=0", "Npix=11.parquet") expected_ids = [*range(700, 831)] assert_parquet_file_ids(output_file, "id", expected_ids) data_frame = pd.read_parquet(output_file, engine="pyarrow") - assert data_frame.index.name == "_hipscat_index" + assert data_frame.index.name == "_healpix_29" npt.assert_array_equal( data_frame.columns, ["id", "Norder", "Dir", "Npix"], @@ -543,21 +543,21 @@ def test_import_hipscat_index( @pytest.mark.dask -def test_import_hipscat_index_no_pandas( +def test_import_healpix_29_no_pandas( dask_client, formats_dir, assert_parquet_file_ids, tmp_path, ): - """Test basic execution, using a previously-computed _hipscat_index column for spatial partitioning.""" - input_file = formats_dir / "hipscat_index.csv" + """Test basic execution, using a previously-computed _healpix_29 column for spatial partitioning.""" + input_file = formats_dir / "spatial_index.csv" args = ImportArguments( - output_artifact_name="using_hipscat_index", + output_artifact_name="using_healpix_29", input_file_list=[input_file], file_reader="csv", output_path=tmp_path, dask_tmp=tmp_path, - use_hipscat_index=True, + use_healpix_29=True, highest_healpix_order=2, pixel_threshold=3_000, progress_bar=False, @@ -566,19 +566,19 @@ def test_import_hipscat_index_no_pandas( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert catalog.catalog_info.total_rows == 131 assert len(catalog.get_healpix_pixels()) == 1 # Check that the catalog parquet file exists and contains correct object IDs - output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + output_file = os.path.join(args.catalog_path, "dataset", "Norder=0", "Dir=0", "Npix=11.parquet") expected_ids = [*range(700, 831)] assert_parquet_file_ids(output_file, "id", expected_ids) data_frame = pd.read_parquet(output_file, engine="pyarrow") - assert data_frame.index.name == "_hipscat_index" + assert data_frame.index.name == "_healpix_29" npt.assert_array_equal( data_frame.columns, ["id", "magnitude", "nobs", "Norder", "Dir", "Npix"], @@ -616,20 +616,20 @@ def test_import_gaia_minimum( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert catalog.catalog_info.total_rows == 5 assert len(catalog.get_healpix_pixels()) == 3 # Pick an output file, and make sure it has valid columns: - output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=5.parquet") + output_file = os.path.join(args.catalog_path, "dataset", 
"Norder=0", "Dir=0", "Npix=5.parquet") data_frame = pd.read_parquet(output_file) - # Make sure that the hipscat index values match the pixel for the partition (0,5) - assert data_frame.index.name == "_hipscat_index" - hipscat_index_pixels = hipscat_id_to_healpix(data_frame.index.values, 0) - npt.assert_array_equal(hipscat_index_pixels, [5, 5, 5]) + # Make sure that the spatial index values match the pixel for the partition (0,5) + assert data_frame.index.name == "_healpix_29" + spatial_index_pixels = spatial_index_to_healpix(data_frame.index.values, 0) + npt.assert_array_equal(spatial_index_pixels, [5, 5, 5]) column_names = data_frame.columns assert "Norder" in column_names @@ -663,13 +663,13 @@ def test_gaia_ecsv( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert catalog.catalog_info.total_rows == 3 assert len(catalog.get_healpix_pixels()) == 1 - output_file = os.path.join(args.catalog_path, "Norder=0", "Dir=0", "Npix=0.parquet") + output_file = os.path.join(args.catalog_path, "dataset", "Norder=0", "Dir=0", "Npix=0.parquet") assert_parquet_file_ids(output_file, "source_id", [10655814178816, 10892037246720, 14263587225600]) @@ -729,7 +729,7 @@ def test_gaia_ecsv( pa.field("Norder", pa.uint8()), pa.field("Dir", pa.uint64()), pa.field("Npix", pa.uint64()), - pa.field("_hipscat_index", pa.uint64()), + pa.field("_healpix_29", pa.int64()), ] ) @@ -742,9 +742,9 @@ def test_gaia_ecsv( schema = pq.read_metadata(output_file).schema.to_arrow_schema() assert schema.equals(expected_parquet_schema, check_metadata=False) - schema = pq.read_metadata(os.path.join(args.catalog_path, "_metadata")).schema.to_arrow_schema() + schema = pq.read_metadata(args.catalog_path / "dataset" / "_metadata").schema.to_arrow_schema() assert schema.equals(expected_parquet_schema, check_metadata=False) - schema = pq.read_metadata(os.path.join(args.catalog_path, "_common_metadata")).schema.to_arrow_schema() + schema = pq.read_metadata(args.catalog_path / "dataset" / "_common_metadata").schema.to_arrow_schema() assert schema.equals(expected_parquet_schema, check_metadata=False) schema = pds.dataset(args.catalog_path, format="parquet").schema assert schema.equals(expected_parquet_schema, check_metadata=False) @@ -782,7 +782,7 @@ def test_import_indexed_csv( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert len(catalog.get_healpix_pixels()) == 1 diff --git a/tests/hipscat_import/catalog/test_sparse_histogram.py b/tests/hats_import/catalog/test_sparse_histogram.py similarity index 97% rename from tests/hipscat_import/catalog/test_sparse_histogram.py rename to tests/hats_import/catalog/test_sparse_histogram.py index 57ce78f2..50b9e68f 100644 --- a/tests/hipscat_import/catalog/test_sparse_histogram.py +++ b/tests/hats_import/catalog/test_sparse_histogram.py @@ -5,7 +5,7 @@ import pytest from scipy.sparse import csr_array -from hipscat_import.catalog.sparse_histogram import SparseHistogram +from hats_import.catalog.sparse_histogram import SparseHistogram def test_read_write_round_trip(tmp_path): diff --git a/tests/hipscat_import/conftest.py b/tests/hats_import/conftest.py similarity index 95% rename from 
tests/hipscat_import/conftest.py rename to tests/hats_import/conftest.py index 1cd8cbf2..998011c6 100644 --- a/tests/hipscat_import/conftest.py +++ b/tests/hats_import/conftest.py @@ -8,7 +8,7 @@ import numpy.testing as npt import pandas as pd import pytest -from hipscat import pixel_math +from hats import pixel_math # pylint: disable=missing-function-docstring, redefined-outer-name @@ -52,7 +52,7 @@ def test_long_running(): @pytest.fixture def test_data_dir(): - return Path(TEST_DIR) / "data" + return Path(TEST_DIR).parent / "data" @pytest.fixture @@ -171,12 +171,12 @@ def basic_data_shard_df(): dec = np.full(360, 0.0) norder = np.full(360, 1) npix = np.full(360, 0) - hipscat_indexes = pixel_math.compute_hipscat_id(ras, dec) + spatial_indexes = pixel_math.compute_spatial_index(ras, dec) test_df = pd.DataFrame( - data=zip(hipscat_indexes, ras, dec, norder, npix), + data=zip(spatial_indexes, ras, dec, norder, npix), columns=[ - "_hipscat_index", + "_healpix_29", "weird_ra", "weird_dec", "Norder", @@ -192,12 +192,12 @@ def polar_data_shard_df(): dec = np.full(360, 89.9) norder = np.full(360, 2) npix = np.full(360, 0) - hipscat_indexes = pixel_math.compute_hipscat_id(ras, dec) + spatial_indexes = pixel_math.compute_spatial_index(ras, dec) test_df = pd.DataFrame( - data=zip(hipscat_indexes, ras, dec, norder, npix), + data=zip(spatial_indexes, ras, dec, norder, npix), columns=[ - "_hipscat_index", + "_healpix_29", "weird_ra", "weird_dec", "Norder", diff --git a/tests/hats_import/hipscat_conversion/test_run_conversion.py b/tests/hats_import/hipscat_conversion/test_run_conversion.py new file mode 100644 index 00000000..041f9a90 --- /dev/null +++ b/tests/hats_import/hipscat_conversion/test_run_conversion.py @@ -0,0 +1,124 @@ +"""test stuff.""" + +import hats +import numpy.testing as npt +import pyarrow as pa +import pyarrow.parquet as pq +import pytest + +import hats_import.hipscat_conversion.run_conversion as runner +from hats_import.hipscat_conversion.arguments import ConversionArguments + + +def test_empty_args(): + """Runner should fail with empty arguments""" + with pytest.raises(TypeError, match="ConversionArguments"): + runner.run(None, None) + + +def test_bad_args(): + """Runner should fail with mis-typed arguments""" + args = {"output_artifact_name": "bad_arg_type"} + with pytest.raises(TypeError, match="ConversionArguments"): + runner.run(args, None) + + +@pytest.mark.dask +def test_run_conversion_object( + test_data_dir, + tmp_path, + assert_parquet_file_ids, + dask_client, +): + """Test appropriate metadata is written""" + + input_catalog_dir = test_data_dir / "hipscat" / "small_sky_object_catalog" + + args = ConversionArguments( + input_catalog_path=input_catalog_dir, + output_path=tmp_path, + output_artifact_name="small_sky_object_hats", + progress_bar=False, + ) + runner.run(args, dask_client) + + # Check that the catalog metadata file exists + catalog = hats.read_hats(args.catalog_path) + assert catalog.on_disk + assert catalog.catalog_path == args.catalog_path + assert int(catalog.catalog_info.__pydantic_extra__["hats_estsize"]) > 0 + + # Check that the catalog parquet file exists and contains correct object IDs + output_file = args.catalog_path / "dataset" / "Norder=0" / "Dir=0" / "Npix=11.parquet" + + expected_ids = [*range(700, 831)] + assert_parquet_file_ids(output_file, "id", expected_ids) + + # Check that the schema is correct for leaf parquet and _metadata files + expected_parquet_schema = pa.schema( + [ + pa.field("_healpix_29", pa.int64()), + pa.field("id", pa.int64()), + 
pa.field("ra", pa.float64()), + pa.field("dec", pa.float64()), + pa.field("ra_error", pa.int64()), + pa.field("dec_error", pa.int64()), + pa.field("Norder", pa.int8()), + pa.field("Dir", pa.int64()), + pa.field("Npix", pa.int64()), + ] + ) + schema = pq.read_metadata(output_file).schema.to_arrow_schema() + assert schema.equals(expected_parquet_schema, check_metadata=False) + schema = pq.read_metadata(args.catalog_path / "dataset" / "_metadata").schema.to_arrow_schema() + assert schema.equals(expected_parquet_schema, check_metadata=False) + schema = pq.read_metadata(args.catalog_path / "dataset" / "_common_metadata").schema.to_arrow_schema() + assert schema.equals(expected_parquet_schema, check_metadata=False) + + +@pytest.mark.dask +def test_run_conversion_source( + test_data_dir, + tmp_path, + dask_client, +): + """Test appropriate metadata is written""" + + input_catalog_dir = test_data_dir / "hipscat" / "small_sky_source_catalog" + + args = ConversionArguments( + input_catalog_path=input_catalog_dir, + output_path=tmp_path, + output_artifact_name="small_sky_source_hats", + progress_bar=False, + ) + runner.run(args, dask_client) + + # Check that the catalog metadata file exists + catalog = hats.read_hats(args.catalog_path) + assert catalog.on_disk + assert catalog.catalog_path == args.catalog_path + + output_file = args.catalog_path / "dataset" / "Norder=2" / "Dir=0" / "Npix=185.parquet" + + source_columns = [ + "_healpix_29", + "source_id", + "source_ra", + "source_dec", + "mjd", + "mag", + "band", + "object_id", + "object_ra", + "object_dec", + "Norder", + "Dir", + "Npix", + ] + schema = pq.read_metadata(output_file).schema + npt.assert_array_equal(schema.names, source_columns) + schema = pq.read_metadata(args.catalog_path / "dataset" / "_metadata").schema + npt.assert_array_equal(schema.names, source_columns) + schema = pq.read_metadata(args.catalog_path / "dataset" / "_common_metadata").schema + npt.assert_array_equal(schema.names, source_columns) diff --git a/tests/hipscat_import/index/test_index_argument.py b/tests/hats_import/index/test_index_argument.py similarity index 84% rename from tests/hipscat_import/index/test_index_argument.py rename to tests/hats_import/index/test_index_argument.py index 2d97432e..c8d26c59 100644 --- a/tests/hipscat_import/index/test_index_argument.py +++ b/tests/hats_import/index/test_index_argument.py @@ -4,7 +4,7 @@ import pytest -from hipscat_import.index.arguments import IndexArguments +from hats_import.index.arguments import IndexArguments def test_none(): @@ -83,7 +83,7 @@ def test_column_inclusion_args(tmp_path, small_sky_object_catalog): indexing_column="id", output_path=tmp_path, output_artifact_name="small_sky_object_index", - include_hipscat_index=False, + include_healpix_29=False, include_order_pixel=False, ) _ = IndexArguments( @@ -91,7 +91,7 @@ def test_column_inclusion_args(tmp_path, small_sky_object_catalog): indexing_column="id", output_path=tmp_path, output_artifact_name="small_sky_object_index", - include_hipscat_index=True, + include_healpix_29=True, include_order_pixel=True, ) @@ -100,7 +100,7 @@ def test_column_inclusion_args(tmp_path, small_sky_object_catalog): indexing_column="id", output_path=tmp_path, output_artifact_name="small_sky_object_index", - include_hipscat_index=True, + include_healpix_29=True, include_order_pixel=False, ) _ = IndexArguments( @@ -108,7 +108,7 @@ def test_column_inclusion_args(tmp_path, small_sky_object_catalog): indexing_column="id", output_path=tmp_path, output_artifact_name="small_sky_object_index", 
- include_hipscat_index=False, + include_healpix_29=False, include_order_pixel=True, ) @@ -119,9 +119,9 @@ def test_extra_columns(tmp_path, small_sky_object_catalog): indexing_column="id", output_path=tmp_path, output_artifact_name="small_sky_object_index", - extra_columns=["_hipscat_index"], + extra_columns=["_healpix_29"], ) - assert args.extra_columns == ["_hipscat_index"] + assert args.extra_columns == ["_healpix_29"] args = IndexArguments( input_catalog_path=small_sky_object_catalog, @@ -154,31 +154,16 @@ def test_compute_partition_size(tmp_path, small_sky_object_catalog): ) -def test_to_catalog_info(small_sky_object_catalog, tmp_path): +def test_to_table_properties(small_sky_object_catalog, tmp_path): """Verify creation of catalog parameters for index to be created.""" args = IndexArguments( input_catalog_path=small_sky_object_catalog, indexing_column="id", output_path=tmp_path, output_artifact_name="small_sky_object_index", - include_hipscat_index=True, + include_healpix_29=True, include_order_pixel=True, ) - catalog_info = args.to_catalog_info(total_rows=10) + catalog_info = args.to_table_properties(total_rows=10) assert catalog_info.catalog_name == args.output_artifact_name assert catalog_info.total_rows == 10 - - -def test_provenance_info(small_sky_object_catalog, tmp_path): - """Verify that provenance info includes index-specific fields.""" - args = IndexArguments( - input_catalog_path=small_sky_object_catalog, - indexing_column="id", - output_path=tmp_path, - output_artifact_name="small_sky_object_index", - include_hipscat_index=True, - include_order_pixel=True, - ) - - runtime_args = args.provenance_info()["runtime_args"] - assert "input_catalog_path" in runtime_args diff --git a/tests/hipscat_import/index/test_index_map_reduce.py b/tests/hats_import/index/test_index_map_reduce.py similarity index 82% rename from tests/hipscat_import/index/test_index_map_reduce.py rename to tests/hats_import/index/test_index_map_reduce.py index 025d0f02..a9a77878 100644 --- a/tests/hipscat_import/index/test_index_map_reduce.py +++ b/tests/hats_import/index/test_index_map_reduce.py @@ -5,8 +5,8 @@ import pandas as pd import pytest -import hipscat_import.index.map_reduce as mr -from hipscat_import.index.arguments import IndexArguments +import hats_import.index.map_reduce as mr +from hats_import.index.arguments import IndexArguments @pytest.mark.dask @@ -26,7 +26,7 @@ def test_create_index( ) mr.create_index(args, dask_client) - output_file = tmp_path / "small_sky_object_index" / "index" / "part.0.parquet" + output_file = tmp_path / "small_sky_object_index" / "dataset" / "index" / "part.0.parquet" expected_ids = [*range(700, 831)] assert_parquet_file_index(output_file, expected_ids) @@ -34,26 +34,26 @@ def test_create_index( data_frame = pd.read_parquet(output_file, engine="pyarrow") npt.assert_array_equal( data_frame.columns, - ["_hipscat_index", "Norder", "Dir", "Npix"], + ["_healpix_29", "Norder", "Dir", "Npix"], ) assert data_frame.index.name == "id" assert (data_frame["Norder"] == 0).all() @pytest.mark.dask -def test_create_index_no_hipscat_index(small_sky_object_catalog, tmp_path, dask_client): - """Create an index for simple object catalog, without the _hipscat_index field.""" +def test_create_index_no_healpix_29(small_sky_object_catalog, tmp_path, dask_client): + """Create an index for simple object catalog, without the _healpix_29 field.""" args = IndexArguments( input_catalog_path=small_sky_object_catalog, indexing_column="id", - include_hipscat_index=False, + include_healpix_29=False, 
output_path=tmp_path, output_artifact_name="small_sky_object_index", progress_bar=False, ) mr.create_index(args, dask_client) - output_file = tmp_path / "small_sky_object_index" / "index" / "part.0.parquet" + output_file = tmp_path / "small_sky_object_index" / "dataset" / "index" / "part.0.parquet" data_frame = pd.read_parquet(output_file, engine="pyarrow") npt.assert_array_equal(data_frame.columns, ["Norder", "Dir", "Npix"]) @@ -74,10 +74,10 @@ def test_create_index_no_order_pixel(small_sky_object_catalog, tmp_path, dask_cl ) mr.create_index(args, dask_client) - output_file = tmp_path / "small_sky_object_index" / "index" / "part.0.parquet" + output_file = tmp_path / "small_sky_object_index" / "dataset" / "index" / "part.0.parquet" data_frame = pd.read_parquet(output_file, engine="pyarrow") - npt.assert_array_equal(data_frame.columns, ["_hipscat_index"]) + npt.assert_array_equal(data_frame.columns, ["_healpix_29"]) assert data_frame.index.name == "id" @@ -93,7 +93,7 @@ def test_create_index_source(small_sky_source_catalog, assert_parquet_file_index ) mr.create_index(args, dask_client) - output_file = tmp_path / "small_sky_source_index" / "index" / "part.0.parquet" + output_file = tmp_path / "small_sky_source_index" / "dataset" / "index" / "part.0.parquet" expected_ids = [*range(70_000, 87_161)] assert_parquet_file_index(output_file, expected_ids) @@ -101,7 +101,7 @@ def test_create_index_source(small_sky_source_catalog, assert_parquet_file_index data_frame = pd.read_parquet(output_file, engine="pyarrow") npt.assert_array_equal( data_frame.columns, - ["_hipscat_index", "Norder", "Dir", "Npix"], + ["_healpix_29", "Norder", "Dir", "Npix"], ) assert data_frame.index.name == "source_id" assert len(data_frame) == 17161 @@ -132,7 +132,7 @@ def test_create_index_with_divisions( ) mr.create_index(args, dask_client) - output_file = tmp_path / "small_sky_source_index" / "index" / "part.0.parquet" + output_file = tmp_path / "small_sky_source_index" / "dataset" / "index" / "part.0.parquet" expected_ids = [*range(70_000, 87_161)] assert_parquet_file_index(output_file, expected_ids) @@ -140,7 +140,7 @@ def test_create_index_with_divisions( data_frame = pd.read_parquet(output_file, engine="pyarrow") npt.assert_array_equal( data_frame.columns, - ["_hipscat_index", "Norder", "Dir", "Npix"], + ["_healpix_29", "Norder", "Dir", "Npix"], ) assert data_frame.index.name == "source_id" assert len(data_frame) == 17161 @@ -165,7 +165,7 @@ def test_create_index_source_by_object( ) mr.create_index(args, dask_client) - output_file = tmp_path / "small_sky_source_index" / "index" / "part.0.parquet" + output_file = tmp_path / "small_sky_source_index" / "dataset" / "index" / "part.0.parquet" expected_ids = np.repeat([*range(700, 831)], 131) assert_parquet_file_index(output_file, expected_ids) @@ -173,7 +173,7 @@ def test_create_index_source_by_object( data_frame = pd.read_parquet(output_file, engine="pyarrow") npt.assert_array_equal( data_frame.columns, - ["_hipscat_index", "Norder", "Dir", "Npix"], + ["_healpix_29", "Norder", "Dir", "Npix"], ) assert data_frame.index.name == "object_id" assert len(data_frame) == 17161 @@ -197,7 +197,7 @@ def test_create_index_extra_columns( ) mr.create_index(args, dask_client) - output_file = tmp_path / "small_sky_source_index" / "index" / "part.0.parquet" + output_file = tmp_path / "small_sky_source_index" / "dataset" / "index" / "part.0.parquet" expected_ids = np.repeat([*range(700, 831)], 131) assert_parquet_file_index(output_file, expected_ids) @@ -205,7 +205,7 @@ def 
test_create_index_extra_columns( data_frame = pd.read_parquet(output_file, engine="pyarrow") npt.assert_array_equal( data_frame.columns, - ["_hipscat_index", "source_ra", "Norder", "Dir", "Npix"], + ["_healpix_29", "source_ra", "Norder", "Dir", "Npix"], ) assert data_frame.index.name == "object_id" assert len(data_frame) == 17161 diff --git a/tests/hipscat_import/index/test_run_index.py b/tests/hats_import/index/test_run_index.py similarity index 73% rename from tests/hipscat_import/index/test_run_index.py rename to tests/hats_import/index/test_run_index.py index a465e557..b45e189d 100644 --- a/tests/hipscat_import/index/test_run_index.py +++ b/tests/hats_import/index/test_run_index.py @@ -1,15 +1,13 @@ """test stuff.""" -import os - import numpy as np import pyarrow as pa import pyarrow.parquet as pq import pytest -from hipscat.catalog.dataset.dataset import Dataset +from hats.catalog.dataset.dataset import Dataset -import hipscat_import.index.run_index as runner -from hipscat_import.index.arguments import IndexArguments +import hats_import.index.run_index as runner +from hats_import.index.arguments import IndexArguments def test_empty_args(): @@ -43,13 +41,13 @@ def test_run_index( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Dataset.read_from_hipscat(args.catalog_path) + catalog = Dataset.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path basic_index_parquet_schema = pa.schema( [ - pa.field("_hipscat_index", pa.uint64()), + pa.field("_healpix_29", pa.int64()), pa.field("Norder", pa.uint8()), pa.field("Dir", pa.uint64()), pa.field("Npix", pa.uint64()), @@ -57,14 +55,14 @@ def test_run_index( ] ) - outfile = os.path.join(args.catalog_path, "index", "part.0.parquet") + outfile = args.catalog_path / "dataset" / "index" / "part.0.parquet" schema = pq.read_metadata(outfile).schema.to_arrow_schema() assert schema.equals(basic_index_parquet_schema, check_metadata=False) - schema = pq.read_metadata(os.path.join(args.catalog_path, "_metadata")).schema.to_arrow_schema() + schema = pq.read_metadata(args.catalog_path / "dataset" / "_metadata").schema.to_arrow_schema() assert schema.equals(basic_index_parquet_schema, check_metadata=False) - schema = pq.read_metadata(os.path.join(args.catalog_path, "_common_metadata")).schema.to_arrow_schema() + schema = pq.read_metadata(args.catalog_path / "dataset" / "_common_metadata").schema.to_arrow_schema() assert schema.equals(basic_index_parquet_schema, check_metadata=False) @@ -79,6 +77,7 @@ def test_run_index_on_source( args = IndexArguments( input_catalog_path=small_sky_source_catalog, indexing_column="source_id", + extra_columns=["mag", "band"], output_path=tmp_path, output_artifact_name="small_sky_source_id_index", progress_bar=False, @@ -86,13 +85,16 @@ def test_run_index_on_source( runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Dataset.read_from_hipscat(args.catalog_path) + catalog = Dataset.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path + assert catalog.catalog_info.extra_columns == ["mag", "band"] basic_index_parquet_schema = pa.schema( [ - pa.field("_hipscat_index", pa.uint64()), + pa.field("_healpix_29", pa.int64()), + pa.field("mag", pa.float64()), + pa.field("band", pa.large_string()), pa.field("Norder", pa.uint8()), pa.field("Dir", pa.uint64()), pa.field("Npix", pa.uint64()), @@ -100,14 +102,14 @@ def test_run_index_on_source( ] ) - outfile = 
os.path.join(args.catalog_path, "index", "part.0.parquet") + outfile = args.catalog_path / "dataset" / "index" / "part.0.parquet" schema = pq.read_metadata(outfile).schema.to_arrow_schema() assert schema.equals(basic_index_parquet_schema, check_metadata=False) - schema = pq.read_metadata(os.path.join(args.catalog_path, "_metadata")).schema.to_arrow_schema() + schema = pq.read_metadata(args.catalog_path / "dataset" / "_metadata").schema.to_arrow_schema() assert schema.equals(basic_index_parquet_schema, check_metadata=False) - schema = pq.read_metadata(os.path.join(args.catalog_path, "_common_metadata")).schema.to_arrow_schema() + schema = pq.read_metadata(args.catalog_path / "dataset" / "_common_metadata").schema.to_arrow_schema() assert schema.equals(basic_index_parquet_schema, check_metadata=False) @@ -125,13 +127,13 @@ def test_run_index_on_source_object_id( indexing_column="object_id", output_path=tmp_path, output_artifact_name="small_sky_source_object_id_index", - include_hipscat_index=False, + include_healpix_29=False, progress_bar=False, ) runner.run(args, dask_client) # Check that the catalog metadata file exists - catalog = Dataset.read_from_hipscat(args.catalog_path) + catalog = Dataset.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path @@ -144,7 +146,7 @@ def test_run_index_on_source_object_id( ] ) - outfile = os.path.join(args.catalog_path, "index", "part.0.parquet") + outfile = args.catalog_path / "dataset" / "index" / "part.0.parquet" schema = pq.read_metadata(outfile).schema.to_arrow_schema() assert schema.equals(basic_index_parquet_schema, check_metadata=False) @@ -155,8 +157,8 @@ def test_run_index_on_source_object_id( assert_parquet_file_index(outfile, doubled_up) - schema = pq.read_metadata(os.path.join(args.catalog_path, "_metadata")).schema.to_arrow_schema() + schema = pq.read_metadata(args.catalog_path / "dataset" / "_metadata").schema.to_arrow_schema() assert schema.equals(basic_index_parquet_schema, check_metadata=False) - schema = pq.read_metadata(os.path.join(args.catalog_path, "_common_metadata")).schema.to_arrow_schema() + schema = pq.read_metadata(args.catalog_path / "dataset" / "_common_metadata").schema.to_arrow_schema() assert schema.equals(basic_index_parquet_schema, check_metadata=False) diff --git a/tests/hipscat_import/margin_cache/test_arguments_margin_cache.py b/tests/hats_import/margin_cache/test_arguments_margin_cache.py similarity index 80% rename from tests/hipscat_import/margin_cache/test_arguments_margin_cache.py rename to tests/hats_import/margin_cache/test_arguments_margin_cache.py index 9729a195..75d20589 100644 --- a/tests/hipscat_import/margin_cache/test_arguments_margin_cache.py +++ b/tests/hats_import/margin_cache/test_arguments_margin_cache.py @@ -1,10 +1,9 @@ """Tests of margin cache generation arguments""" import pytest -from hipscat.io import write_metadata -from hipscat.pixel_math.healpix_pixel import HealpixPixel +from hats.pixel_math.healpix_pixel import HealpixPixel -from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments +from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments def test_empty_required(tmp_path): @@ -113,7 +112,7 @@ def test_debug_filter_pixel_list(small_sky_source_catalog, tmp_path): ) -def test_to_catalog_info(small_sky_source_catalog, tmp_path): +def test_to_table_properties(small_sky_source_catalog, tmp_path): """Verify creation of catalog info for margin cache to be created.""" args = MarginCacheArguments( 
margin_threshold=5.0, @@ -122,28 +121,8 @@ def test_to_catalog_info(small_sky_source_catalog, tmp_path): output_artifact_name="catalog_cache", margin_order=4, ) - catalog_info = args.to_catalog_info(total_rows=10) + catalog_info = args.to_table_properties(total_rows=10, highest_order=4, moc_sky_fraction=22 / 7) assert catalog_info.catalog_name == args.output_artifact_name assert catalog_info.total_rows == 10 - assert catalog_info.epoch == "J2000" assert catalog_info.ra_column == "source_ra" assert catalog_info.dec_column == "source_dec" - - -def test_provenance_info(small_sky_source_catalog, tmp_path): - """Verify that provenance info includes margin-cache-specific fields.""" - args = MarginCacheArguments( - margin_threshold=5.0, - input_catalog_path=small_sky_source_catalog, - output_path=tmp_path, - output_artifact_name="catalog_cache", - margin_order=4, - debug_filter_pixel_list=[HealpixPixel(1, 44)], - ) - - runtime_args = args.provenance_info()["runtime_args"] - assert "margin_threshold" in runtime_args - - write_metadata.write_provenance_info( - catalog_base_dir=args.catalog_path, dataset_info=args.to_catalog_info(20_000), tool_args=runtime_args - ) diff --git a/tests/hipscat_import/margin_cache/test_margin_cache.py b/tests/hats_import/margin_cache/test_margin_cache.py similarity index 70% rename from tests/hipscat_import/margin_cache/test_margin_cache.py rename to tests/hats_import/margin_cache/test_margin_cache.py index 3b79e25b..1e4768b0 100644 --- a/tests/hipscat_import/margin_cache/test_margin_cache.py +++ b/tests/hats_import/margin_cache/test_margin_cache.py @@ -4,13 +4,12 @@ import numpy.testing as npt import pandas as pd import pytest -from hipscat.catalog import PartitionInfo -from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset -from hipscat.io import paths -from hipscat.pixel_math.healpix_pixel import HealpixPixel +from hats.catalog.healpix_dataset.healpix_dataset import HealpixDataset +from hats.io import paths +from hats.pixel_math.healpix_pixel import HealpixPixel -import hipscat_import.margin_cache.margin_cache as mc -from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments +import hats_import.margin_cache.margin_cache as mc +from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments @pytest.mark.dask(timeout=150) @@ -38,13 +37,13 @@ def test_margin_cache_gen(small_sky_source_catalog, tmp_path, dask_client): assert len(data) == 13 - assert all(data[PartitionInfo.METADATA_ORDER_COLUMN_NAME] == norder) - assert all(data[PartitionInfo.METADATA_PIXEL_COLUMN_NAME] == npix) - assert all(data[PartitionInfo.METADATA_DIR_COLUMN_NAME] == int(npix / 10000) * 10000) + assert all(data[paths.PARTITION_ORDER] == norder) + assert all(data[paths.PARTITION_PIXEL] == npix) + assert all(data[paths.PARTITION_DIR] == int(npix / 10_000) * 10_000) - assert data.dtypes[PartitionInfo.METADATA_ORDER_COLUMN_NAME] == np.uint8 - assert data.dtypes[PartitionInfo.METADATA_DIR_COLUMN_NAME] == np.uint64 - assert data.dtypes[PartitionInfo.METADATA_PIXEL_COLUMN_NAME] == np.uint64 + assert data.dtypes[paths.PARTITION_ORDER] == np.uint8 + assert data.dtypes[paths.PARTITION_PIXEL] == np.uint64 + assert data.dtypes[paths.PARTITION_DIR] == np.uint64 npt.assert_array_equal( data.columns, @@ -66,9 +65,9 @@ def test_margin_cache_gen(small_sky_source_catalog, tmp_path, dask_client): "margin_Npix", ], ) - assert data.index.name == "_hipscat_index" + assert data.index.name == "_healpix_29" - catalog = 
HealpixDataset.read_from_hipscat(args.catalog_path) + catalog = HealpixDataset.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path diff --git a/tests/hipscat_import/margin_cache/test_margin_cache_map_reduce.py b/tests/hats_import/margin_cache/test_margin_cache_map_reduce.py similarity index 79% rename from tests/hipscat_import/margin_cache/test_margin_cache_map_reduce.py rename to tests/hats_import/margin_cache/test_margin_cache_map_reduce.py index 7e029423..dc610501 100644 --- a/tests/hipscat_import/margin_cache/test_margin_cache_map_reduce.py +++ b/tests/hats_import/margin_cache/test_margin_cache_map_reduce.py @@ -1,15 +1,15 @@ import os -import hipscat.pixel_math.healpix_shim as hp +import hats.pixel_math.healpix_shim as hp import numpy as np import pandas as pd import pytest -from hipscat import pixel_math -from hipscat.io import paths -from hipscat.pixel_math.healpix_pixel import HealpixPixel +from hats import pixel_math +from hats.io import paths +from hats.pixel_math.healpix_pixel import HealpixPixel -from hipscat_import.margin_cache import margin_cache_map_reduce -from hipscat_import.pipeline_resume_plan import get_pixel_cache_directory +from hats_import.margin_cache import margin_cache_map_reduce +from hats_import.pipeline_resume_plan import get_pixel_cache_directory keep_cols = ["weird_ra", "weird_dec"] @@ -49,7 +49,7 @@ def test_to_pixel_shard_equator(tmp_path, basic_data_shard_df): fine_filtering=True, ) - path = tmp_path / "order_1" / "dir_0" / "pixel_21" / "Norder=1" / "Dir=0" / "Npix=0.parquet" + path = tmp_path / "order_1" / "dir_0" / "pixel_21" / "dataset" / "Norder=1" / "Dir=0" / "Npix=0.parquet" assert os.path.exists(path) @@ -69,7 +69,7 @@ def test_to_pixel_shard_polar(tmp_path, polar_data_shard_df): fine_filtering=True, ) - path = tmp_path / "order_2" / "dir_0" / "pixel_15" / "Norder=2" / "Dir=0" / "Npix=0.parquet" + path = tmp_path / "order_2" / "dir_0" / "pixel_15" / "dataset" / "Norder=2" / "Dir=0" / "Npix=0.parquet" assert os.path.exists(path) @@ -103,9 +103,9 @@ def test_map_pixel_shards_fine(tmp_path, test_data_dir, small_sky_source_catalog intermediate_dir = tmp_path / "intermediate" os.makedirs(intermediate_dir / "mapping") margin_cache_map_reduce.map_pixel_shards( - small_sky_source_catalog / "Norder=1" / "Dir=0" / "Npix=47.parquet", + small_sky_source_catalog / "dataset" / "Norder=1" / "Dir=0" / "Npix=47.parquet", mapping_key="1_47", - original_catalog_metadata=small_sky_source_catalog / "_common_metadata", + original_catalog_metadata=small_sky_source_catalog / "dataset" / "_common_metadata", margin_pair_file=test_data_dir / "margin_pairs" / "small_sky_source_pairs.csv", margin_threshold=3600, output_path=intermediate_dir, @@ -115,12 +115,30 @@ def test_map_pixel_shards_fine(tmp_path, test_data_dir, small_sky_source_catalog fine_filtering=True, ) - path = intermediate_dir / "order_2" / "dir_0" / "pixel_182" / "Norder=1" / "Dir=0" / "Npix=47.parquet" + path = ( + intermediate_dir + / "order_2" + / "dir_0" + / "pixel_182" + / "dataset" + / "Norder=1" + / "Dir=0" + / "Npix=47.parquet" + ) assert os.path.exists(path) res_df = pd.read_parquet(path) assert len(res_df) == 107 - path = intermediate_dir / "order_2" / "dir_0" / "pixel_185" / "Norder=1" / "Dir=0" / "Npix=47.parquet" + path = ( + intermediate_dir + / "order_2" + / "dir_0" + / "pixel_185" + / "dataset" + / "Norder=1" + / "Dir=0" + / "Npix=47.parquet" + ) assert os.path.exists(path) res_df = pd.read_parquet(path) assert len(res_df) == 37 @@ -132,9 +150,9 @@ 
def test_map_pixel_shards_coarse(tmp_path, test_data_dir, small_sky_source_catal intermediate_dir = tmp_path / "intermediate" os.makedirs(intermediate_dir / "mapping") margin_cache_map_reduce.map_pixel_shards( - small_sky_source_catalog / "Norder=1" / "Dir=0" / "Npix=47.parquet", + small_sky_source_catalog / "dataset" / "Norder=1" / "Dir=0" / "Npix=47.parquet", mapping_key="1_47", - original_catalog_metadata=small_sky_source_catalog / "_common_metadata", + original_catalog_metadata=small_sky_source_catalog / "dataset" / "_common_metadata", margin_pair_file=test_data_dir / "margin_pairs" / "small_sky_source_pairs.csv", margin_threshold=3600, output_path=intermediate_dir, @@ -144,12 +162,30 @@ def test_map_pixel_shards_coarse(tmp_path, test_data_dir, small_sky_source_catal fine_filtering=False, ) - path = intermediate_dir / "order_2" / "dir_0" / "pixel_182" / "Norder=1" / "Dir=0" / "Npix=47.parquet" + path = ( + intermediate_dir + / "order_2" + / "dir_0" + / "pixel_182" + / "dataset" + / "Norder=1" + / "Dir=0" + / "Npix=47.parquet" + ) assert os.path.exists(path) res_df = pd.read_parquet(path) assert len(res_df) == 1386 - path = intermediate_dir / "order_2" / "dir_0" / "pixel_185" / "Norder=1" / "Dir=0" / "Npix=47.parquet" + path = ( + intermediate_dir + / "order_2" + / "dir_0" + / "pixel_185" + / "dataset" + / "Norder=1" + / "Dir=0" + / "Npix=47.parquet" + ) assert os.path.exists(path) res_df = pd.read_parquet(path) assert len(res_df) == 1978 @@ -171,15 +207,15 @@ def test_reduce_margin_shards(tmp_path): norder = np.full(360, 1) ndir = np.full(360, 0) npix = np.full(360, 0) - hipscat_indexes = pixel_math.compute_hipscat_id(ras, dec) + hats_indexes = pixel_math.compute_spatial_index(ras, dec) margin_order = np.full(360, 0) margin_dir = np.full(360, 0) margin_pixels = hp.ang2pix(2**3, ras, dec, lonlat=True, nest=True) test_df = pd.DataFrame( - data=zip(hipscat_indexes, ras, dec, norder, ndir, npix, margin_order, margin_dir, margin_pixels), + data=zip(hats_indexes, ras, dec, norder, ndir, npix, margin_order, margin_dir, margin_pixels), columns=[ - "_hipscat_index", + "_healpix_29", "weird_ra", "weird_dec", "Norder", diff --git a/tests/hipscat_import/margin_cache/test_margin_cache_resume_plan.py b/tests/hats_import/margin_cache/test_margin_cache_resume_plan.py similarity index 93% rename from tests/hipscat_import/margin_cache/test_margin_cache_resume_plan.py rename to tests/hats_import/margin_cache/test_margin_cache_resume_plan.py index 34087ada..f9b3a3a2 100644 --- a/tests/hipscat_import/margin_cache/test_margin_cache_resume_plan.py +++ b/tests/hats_import/margin_cache/test_margin_cache_resume_plan.py @@ -1,10 +1,10 @@ import numpy as np import numpy.testing as npt import pytest -from hipscat.catalog import Catalog +from hats.catalog import Catalog -from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments -from hipscat_import.margin_cache.margin_cache_resume_plan import ( +from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments +from hats_import.margin_cache.margin_cache_resume_plan import ( MarginCachePlan, _find_partition_margin_pixel_pairs, ) @@ -101,7 +101,7 @@ def test_some_reducing_task_failures(small_sky_margin_args, dask_client): def test_partition_margin_pixel_pairs(small_sky_source_catalog): """Ensure partition_margin_pixel_pairs can generate main partition pixels.""" - source_catalog = Catalog.read_from_hipscat(small_sky_source_catalog) + source_catalog = Catalog.read_hats(small_sky_source_catalog) margin_pairs = 
_find_partition_margin_pixel_pairs(source_catalog.get_healpix_pixels(), 3) expected = np.array([0, 2, 8, 10, 32, 34, 40, 42, 192, 192]) @@ -112,7 +112,7 @@ def test_partition_margin_pixel_pairs(small_sky_source_catalog): def test_partition_margin_pixel_pairs_negative(small_sky_source_catalog): """Ensure partition_margin_pixel_pairs can generate negative tree pixels.""" - source_catalog = Catalog.read_from_hipscat(small_sky_source_catalog) + source_catalog = Catalog.read_hats(small_sky_source_catalog) partition_stats = source_catalog.get_healpix_pixels() negative_pixels = source_catalog.generate_negative_tree_pixels() diff --git a/tests/hipscat_import/margin_cache/test_margin_round_trip.py b/tests/hats_import/margin_cache/test_margin_round_trip.py similarity index 81% rename from tests/hipscat_import/margin_cache/test_margin_round_trip.py rename to tests/hats_import/margin_cache/test_margin_round_trip.py index b557cb77..7fe87a0a 100644 --- a/tests/hipscat_import/margin_cache/test_margin_round_trip.py +++ b/tests/hats_import/margin_cache/test_margin_round_trip.py @@ -5,16 +5,16 @@ import pandas as pd import pytest -from hipscat.catalog.catalog import Catalog -from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset -from hipscat.io import paths -from hipscat.pixel_math.healpix_pixel import HealpixPixel +from hats.catalog.catalog import Catalog +from hats.catalog.healpix_dataset.healpix_dataset import HealpixDataset +from hats.io import paths +from hats.pixel_math.healpix_pixel import HealpixPixel -import hipscat_import.catalog.run_import as runner -import hipscat_import.margin_cache.margin_cache as mc -from hipscat_import.catalog.arguments import ImportArguments -from hipscat_import.catalog.file_readers import CsvReader, get_file_reader -from hipscat_import.margin_cache.margin_cache_arguments import MarginCacheArguments +import hats_import.catalog.run_import as runner +import hats_import.margin_cache.margin_cache as mc +from hats_import.catalog.arguments import ImportArguments +from hats_import.catalog.file_readers import CsvReader, get_file_reader +from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments @pytest.mark.dask(timeout=180) @@ -49,7 +49,7 @@ def test_margin_import_gaia_minimum( runner.run(args, dask_client) # Check that the catalog metadata file exists - Catalog.read_from_hipscat(args.catalog_path) + Catalog.read_hats(args.catalog_path) args = MarginCacheArguments( margin_threshold=180.0, @@ -61,7 +61,7 @@ def test_margin_import_gaia_minimum( ) mc.generate_margin_cache(args, dask_client) - catalog = HealpixDataset.read_from_hipscat(args.catalog_path) + catalog = HealpixDataset.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert len(catalog.get_healpix_pixels()) == 1 @@ -102,7 +102,7 @@ def test_margin_import_mixed_schema_csv( progress_bar=False, ) runner.run(args, dask_client) - catalog = Catalog.read_from_hipscat(args.catalog_path) + catalog = Catalog.read_hats(args.catalog_path) assert len(catalog.get_healpix_pixels()) == 8 args = MarginCacheArguments( @@ -115,7 +115,7 @@ def test_margin_import_mixed_schema_csv( ) mc.generate_margin_cache(args, dask_client) - catalog = HealpixDataset.read_from_hipscat(args.catalog_path) + catalog = HealpixDataset.read_hats(args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == args.catalog_path assert len(catalog.get_healpix_pixels()) == 5 diff --git a/tests/hipscat_import/soap/conftest.py b/tests/hats_import/soap/conftest.py 
similarity index 92% rename from tests/hipscat_import/soap/conftest.py rename to tests/hats_import/soap/conftest.py index 72161ca8..7a17ee6a 100644 --- a/tests/hipscat_import/soap/conftest.py +++ b/tests/hats_import/soap/conftest.py @@ -1,7 +1,7 @@ import pytest -from hipscat.pixel_math.healpix_pixel import HealpixPixel +from hats.pixel_math.healpix_pixel import HealpixPixel -from hipscat_import.soap.arguments import SoapArguments +from hats_import.soap.arguments import SoapArguments @pytest.fixture @@ -47,7 +47,6 @@ def catalog_info_data() -> dict: "catalog_name": "test_name", "catalog_type": "object", "total_rows": 10, - "epoch": "J2000", "ra_column": "ra", "dec_column": "dec", } @@ -59,7 +58,6 @@ def source_catalog_info() -> dict: "catalog_name": "test_source", "catalog_type": "source", "total_rows": 100, - "epoch": "J2000", "ra_column": "source_ra", "dec_column": "source_dec", } diff --git a/tests/hipscat_import/soap/test_run_soap.py b/tests/hats_import/soap/test_run_soap.py similarity index 71% rename from tests/hipscat_import/soap/test_run_soap.py rename to tests/hats_import/soap/test_run_soap.py index a3fd5333..72c6cf6f 100644 --- a/tests/hipscat_import/soap/test_run_soap.py +++ b/tests/hats_import/soap/test_run_soap.py @@ -5,10 +5,10 @@ import pyarrow as pa import pyarrow.parquet as pq import pytest -from hipscat.catalog.association_catalog.association_catalog import AssociationCatalog +from hats.catalog.association_catalog.association_catalog import AssociationCatalog -import hipscat_import.soap.run_soap as runner -from hipscat_import.soap.arguments import SoapArguments +import hats_import.soap.run_soap as runner +from hats_import.soap.arguments import SoapArguments def test_empty_args(): @@ -30,7 +30,7 @@ def test_object_to_source(dask_client, small_sky_soap_args): runner.run(small_sky_soap_args, dask_client) ## Check that the association data can be parsed as a valid association catalog. - catalog = AssociationCatalog.read_from_hipscat(small_sky_soap_args.catalog_path) + catalog = AssociationCatalog.read_hats(small_sky_soap_args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == small_sky_soap_args.catalog_path assert len(catalog.get_join_pixels()) == 14 @@ -54,7 +54,7 @@ def test_object_to_self(dask_client, tmp_path, small_sky_object_catalog): runner.run(small_sky_soap_args, dask_client) ## Check that the association data can be parsed as a valid association catalog. - catalog = AssociationCatalog.read_from_hipscat(small_sky_soap_args.catalog_path) + catalog = AssociationCatalog.read_hats(small_sky_soap_args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == small_sky_soap_args.catalog_path assert len(catalog.get_join_pixels()) == 1 @@ -64,7 +64,7 @@ def test_object_to_self(dask_client, tmp_path, small_sky_object_catalog): @pytest.mark.dask def test_object_to_source_with_leaves( - dask_client, tmp_path, small_sky_object_catalog, small_sky_source_catalog, assert_text_file_matches + dask_client, tmp_path, small_sky_object_catalog, small_sky_source_catalog ): """Test creating association between object and source catalogs.""" small_sky_soap_args = SoapArguments( @@ -81,14 +81,16 @@ def test_object_to_source_with_leaves( runner.run(small_sky_soap_args, dask_client) ## Check that the association data can be parsed as a valid association catalog. 
- catalog = AssociationCatalog.read_from_hipscat(small_sky_soap_args.catalog_path) + catalog = AssociationCatalog.read_hats(small_sky_soap_args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == small_sky_soap_args.catalog_path assert len(catalog.get_join_pixels()) == 14 assert catalog.catalog_info.total_rows == 17161 assert catalog.catalog_info.contains_leaf_files - parquet_file_name = os.path.join(small_sky_soap_args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + parquet_file_name = ( + small_sky_soap_args.catalog_path / "dataset" / "Norder=0" / "Dir=0" / "Npix=11.parquet" + ) assert os.path.exists(parquet_file_name), f"file not found [{parquet_file_name}]" parquet_file = pq.ParquetFile(parquet_file_name) @@ -110,28 +112,10 @@ def test_object_to_source_with_leaves( ) assert parquet_file.metadata.schema.to_arrow_schema().equals(exepcted_schema, check_metadata=False) - expected_lines = [ - "{", - ' "catalog_name": "small_sky_object_to_source",', - ' "catalog_type": "association",', - ' "total_rows": 17161,', - r' "primary_catalog": ".*small_sky_object_catalog",', - ' "primary_column": "id",', - ' "primary_column_association": "object_id",', - r' "join_catalog": ".*small_sky_source_catalog",', - ' "join_column": "object_id",', - ' "join_column_association": "source_id",', - ' "contains_leaf_files": true', - "}", - ] - - metadata_filename = os.path.join(small_sky_soap_args.catalog_path, "catalog_info.json") - assert_text_file_matches(expected_lines, metadata_filename) - @pytest.mark.dask def test_object_to_source_with_leaves_drop_duplicates( - dask_client, tmp_path, small_sky_object_catalog, small_sky_source_catalog, assert_text_file_matches + dask_client, tmp_path, small_sky_object_catalog, small_sky_source_catalog ): """Test creating association between object and source catalogs.""" small_sky_soap_args = SoapArguments( @@ -148,14 +132,16 @@ def test_object_to_source_with_leaves_drop_duplicates( runner.run(small_sky_soap_args, dask_client) ## Check that the association data can be parsed as a valid association catalog. 
- catalog = AssociationCatalog.read_from_hipscat(small_sky_soap_args.catalog_path) + catalog = AssociationCatalog.read_hats(small_sky_soap_args.catalog_path) assert catalog.on_disk assert catalog.catalog_path == small_sky_soap_args.catalog_path assert len(catalog.get_join_pixels()) == 14 assert catalog.catalog_info.total_rows == 148 assert catalog.catalog_info.contains_leaf_files - parquet_file_name = os.path.join(small_sky_soap_args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + parquet_file_name = ( + small_sky_soap_args.catalog_path / "dataset" / "Norder=0" / "Dir=0" / "Npix=11.parquet" + ) assert os.path.exists(parquet_file_name), f"file not found [{parquet_file_name}]" parquet_file = pq.ParquetFile(parquet_file_name) @@ -176,21 +162,3 @@ def test_object_to_source_with_leaves_drop_duplicates( ] ) assert parquet_file.metadata.schema.to_arrow_schema().equals(exepcted_schema, check_metadata=False) - - expected_lines = [ - "{", - ' "catalog_name": "small_sky_object_to_source",', - ' "catalog_type": "association",', - ' "total_rows": 148,', - r' "primary_catalog": ".*small_sky_object_catalog",', - ' "primary_column": "id",', - ' "primary_column_association": "object_id",', - r' "join_catalog": ".*small_sky_source_catalog",', - ' "join_column": "object_id",', - ' "join_column_association": "source_id",', - ' "contains_leaf_files": true', - "}", - ] - - metadata_filename = os.path.join(small_sky_soap_args.catalog_path, "catalog_info.json") - assert_text_file_matches(expected_lines, metadata_filename) diff --git a/tests/hipscat_import/soap/test_soap_arguments.py b/tests/hats_import/soap/test_soap_arguments.py similarity index 98% rename from tests/hipscat_import/soap/test_soap_arguments.py rename to tests/hats_import/soap/test_soap_arguments.py index d40f5ed3..b398d8ab 100644 --- a/tests/hipscat_import/soap/test_soap_arguments.py +++ b/tests/hats_import/soap/test_soap_arguments.py @@ -1,6 +1,6 @@ import pytest -from hipscat_import.soap.arguments import SoapArguments +from hats_import.soap.arguments import SoapArguments def test_none(): diff --git a/tests/hipscat_import/soap/test_soap_map_reduce.py b/tests/hats_import/soap/test_soap_map_reduce.py similarity index 95% rename from tests/hipscat_import/soap/test_soap_map_reduce.py rename to tests/hats_import/soap/test_soap_map_reduce.py index c605eb29..a8d77952 100644 --- a/tests/hipscat_import/soap/test_soap_map_reduce.py +++ b/tests/hats_import/soap/test_soap_map_reduce.py @@ -8,10 +8,10 @@ import pandas as pd import pyarrow.parquet as pq import pytest -from hipscat.pixel_math.healpix_pixel import HealpixPixel +from hats.pixel_math.healpix_pixel import HealpixPixel -from hipscat_import.soap.arguments import SoapArguments -from hipscat_import.soap.map_reduce import combine_partial_results, count_joins, reduce_joins +from hats_import.soap.arguments import SoapArguments +from hats_import.soap.map_reduce import combine_partial_results, count_joins, reduce_joins def test_count_joins(small_sky_soap_args, tmp_path, small_sky_soap_maps): @@ -140,7 +140,9 @@ def test_reduce_joins(small_sky_soap_args, soap_intermediate_dir, small_sky_soap reduce_joins(small_sky_soap_args, HealpixPixel(0, 11), object_key="0_11") - parquet_file_name = os.path.join(small_sky_soap_args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet") + parquet_file_name = ( + small_sky_soap_args.catalog_path / "dataset" / "Norder=0" / "Dir=0" / "Npix=11.parquet" + ) assert os.path.exists(parquet_file_name), f"file not found [{parquet_file_name}]" parquet_file = 
pq.ParquetFile(parquet_file_name) diff --git a/tests/hipscat_import/soap/test_soap_resume_plan.py b/tests/hats_import/soap/test_soap_resume_plan.py similarity index 90% rename from tests/hipscat_import/soap/test_soap_resume_plan.py rename to tests/hats_import/soap/test_soap_resume_plan.py index 29e56e58..f9b839da 100644 --- a/tests/hipscat_import/soap/test_soap_resume_plan.py +++ b/tests/hats_import/soap/test_soap_resume_plan.py @@ -4,17 +4,16 @@ from pathlib import Path import pytest -from hipscat.catalog import Catalog -from hipscat.catalog.catalog_info import CatalogInfo -from hipscat.pixel_math.healpix_pixel import HealpixPixel +from hats.catalog import Catalog, TableProperties +from hats.pixel_math.healpix_pixel import HealpixPixel -from hipscat_import.soap.resume_plan import SoapPlan, source_to_object_map +from hats_import.soap.resume_plan import SoapPlan, source_to_object_map def test_source_to_object_map(small_sky_object_catalog, small_sky_source_catalog, small_sky_soap_maps): """Test creating plan map for object and source catalogs.""" - object_catalog = Catalog.read_from_hipscat(small_sky_object_catalog) - source_catalog = Catalog.read_from_hipscat(small_sky_source_catalog) + object_catalog = Catalog.read_hats(small_sky_object_catalog) + source_catalog = Catalog.read_hats(small_sky_source_catalog) source_to_object = source_to_object_map(object_catalog, source_catalog) assert source_to_object == small_sky_soap_maps @@ -41,8 +40,8 @@ def test_object_to_source_map(small_sky_object_catalog, small_sky_source_catalog ] } ## Oh, we're so silly! - object_catalog = Catalog.read_from_hipscat(small_sky_source_catalog) - source_catalog = Catalog.read_from_hipscat(small_sky_object_catalog) + object_catalog = Catalog.read_hats(small_sky_source_catalog) + source_catalog = Catalog.read_hats(small_sky_object_catalog) source_to_object = source_to_object_map(object_catalog, source_catalog) assert source_to_object == expected @@ -51,7 +50,7 @@ def test_object_to_source_map(small_sky_object_catalog, small_sky_source_catalog def test_mismatch_order_map(catalog_info_data, source_catalog_info): """Create some catalogs that will exercise edge case behavior of map-generation.""" object_catalog = Catalog( - CatalogInfo(**catalog_info_data), + TableProperties(**catalog_info_data), [ HealpixPixel(1, 16), HealpixPixel(2, 68), @@ -60,7 +59,7 @@ def test_mismatch_order_map(catalog_info_data, source_catalog_info): HealpixPixel(2, 71), ], ) - source_catalog = Catalog(CatalogInfo(**source_catalog_info), [HealpixPixel(1, 16)]) + source_catalog = Catalog(TableProperties(**source_catalog_info), [HealpixPixel(1, 16)]) expected = { HealpixPixel(1, 16): [ diff --git a/tests/hats_import/test_packaging.py b/tests/hats_import/test_packaging.py new file mode 100644 index 00000000..c19c92f7 --- /dev/null +++ b/tests/hats_import/test_packaging.py @@ -0,0 +1,6 @@ +import hats_import + + +def test_hats_import_version(): + """Check to see that we can get the hats-import version""" + assert hats_import.__version__ is not None diff --git a/tests/hipscat_import/test_pipeline_resume_plan.py b/tests/hats_import/test_pipeline_resume_plan.py similarity index 98% rename from tests/hipscat_import/test_pipeline_resume_plan.py rename to tests/hats_import/test_pipeline_resume_plan.py index 7334d6a2..b1bfeac7 100644 --- a/tests/hipscat_import/test_pipeline_resume_plan.py +++ b/tests/hats_import/test_pipeline_resume_plan.py @@ -5,7 +5,7 @@ import numpy.testing as npt import pytest -from hipscat_import.pipeline_resume_plan import 
PipelineResumePlan, get_formatted_stage_name +from hats_import.pipeline_resume_plan import PipelineResumePlan, get_formatted_stage_name def test_done_key(tmp_path): diff --git a/tests/hipscat_import/test_runtime_arguments.py b/tests/hats_import/test_runtime_arguments.py similarity index 70% rename from tests/hipscat_import/test_runtime_arguments.py rename to tests/hats_import/test_runtime_arguments.py index cea801cc..2bb16874 100644 --- a/tests/hipscat_import/test_runtime_arguments.py +++ b/tests/hats_import/test_runtime_arguments.py @@ -2,7 +2,7 @@ import pytest -from hipscat_import.runtime_arguments import RuntimeArguments +from hats_import.runtime_arguments import RuntimeArguments # pylint: disable=protected-access @@ -123,15 +123,48 @@ def test_dask_args(tmp_path): ) -def test_provenance_info(tmp_path): - """Verify that provenance info ONLY includes general runtime fields.""" +def test_extra_property_dict(test_data_dir): args = RuntimeArguments( - output_artifact_name="catalog", - output_path=tmp_path, - tmp_dir=tmp_path, - dask_tmp=tmp_path, - progress_bar=False, + output_artifact_name="small_sky_source_catalog", + output_path=test_data_dir, + ) + + properties = args.extra_property_dict() + assert list(properties.keys()) == [ + "hats_builder", + "hats_creation_date", + "hats_estsize", + "hats_release_date", + "hats_version", + ] + + # Most values are dynamic, but these are some safe assumptions. + assert properties["hats_builder"].startswith("hats") + assert properties["hats_creation_date"].startswith("20") + assert properties["hats_estsize"] > 1_000 + assert properties["hats_release_date"].startswith("20") + assert properties["hats_version"].startswith("v") + + args = RuntimeArguments( + output_artifact_name="small_sky_source_catalog", + output_path=test_data_dir, + addl_hats_properties={"foo": "bar"}, ) - runtime_args = args.provenance_info()["runtime_args"] - assert len(runtime_args) == 9 + properties = args.extra_property_dict() + assert list(properties.keys()) == [ + "hats_builder", + "hats_creation_date", + "hats_estsize", + "hats_release_date", + "hats_version", + "foo", + ] + + # Most values are dynamic, but these are some safe assumptions. 
+ assert properties["hats_builder"].startswith("hats") + assert properties["hats_creation_date"].startswith("20") + assert properties["hats_estsize"] > 1_000 + assert properties["hats_release_date"].startswith("20") + assert properties["hats_version"].startswith("v") + assert properties["foo"] == "bar" diff --git a/tests/hipscat_import/verification/test_run_verification.py b/tests/hats_import/verification/test_run_verification.py similarity index 85% rename from tests/hipscat_import/verification/test_run_verification.py rename to tests/hats_import/verification/test_run_verification.py index c672af7f..33be14f5 100644 --- a/tests/hipscat_import/verification/test_run_verification.py +++ b/tests/hats_import/verification/test_run_verification.py @@ -1,7 +1,7 @@ import pytest -import hipscat_import.verification.run_verification as runner -from hipscat_import.verification.arguments import VerificationArguments +import hats_import.verification.run_verification as runner +from hats_import.verification.arguments import VerificationArguments def test_bad_args(): diff --git a/tests/hipscat_import/verification/test_verification_arguments.py b/tests/hats_import/verification/test_verification_arguments.py similarity index 76% rename from tests/hipscat_import/verification/test_verification_arguments.py rename to tests/hats_import/verification/test_verification_arguments.py index 8ebd6c81..dd2203ba 100644 --- a/tests/hipscat_import/verification/test_verification_arguments.py +++ b/tests/hats_import/verification/test_verification_arguments.py @@ -1,9 +1,9 @@ """Tests of argument validation""" import pytest -from hipscat.catalog import Catalog +from hats.catalog import Catalog -from hipscat_import.verification.arguments import VerificationArguments +from hats_import.verification.arguments import VerificationArguments def test_none(): @@ -53,9 +53,13 @@ def test_good_paths(tmp_path, small_sky_object_catalog): assert str(args.tmp_path).startswith(tmp_path_str) +@pytest.mark.timeout(5) def test_catalog_object(tmp_path, small_sky_object_catalog): - """Required arguments are provided, and paths are found.""" - small_sky_catalog_object = Catalog.read_from_hipscat(catalog_path=small_sky_object_catalog) + """Required arguments are provided, and paths are found. + + NB: This is currently the last test in alpha-order, and may require additional + time to teardown fixtures.""" + small_sky_catalog_object = Catalog.read_hats(catalog_path=small_sky_object_catalog) tmp_path_str = str(tmp_path) args = VerificationArguments( input_catalog=small_sky_catalog_object, @@ -65,18 +69,3 @@ def test_catalog_object(tmp_path, small_sky_object_catalog): assert args.input_catalog_path == small_sky_object_catalog assert str(args.output_path) == tmp_path_str assert str(args.tmp_path).startswith(tmp_path_str) - - -@pytest.mark.timeout(5) -def test_provenance_info(small_sky_object_catalog, tmp_path): - """Verify that provenance info includes verification-specific fields. 
- NB: This is currently the last test in alpha-order, and may require additional - time to teardown fixtures.""" - args = VerificationArguments( - input_catalog_path=small_sky_object_catalog, - output_path=tmp_path, - output_artifact_name="small_sky_object_verification_report", - ) - - runtime_args = args.provenance_info()["runtime_args"] - assert "input_catalog_path" in runtime_args diff --git a/tests/hipscat_import/data/indexed_files/csv_list_double_1_of_2.txt b/tests/hipscat_import/data/indexed_files/csv_list_double_1_of_2.txt deleted file mode 100644 index 8e9c9d54..00000000 --- a/tests/hipscat_import/data/indexed_files/csv_list_double_1_of_2.txt +++ /dev/null @@ -1,3 +0,0 @@ -tests/hipscat_import/data/small_sky_parts/catalog_00_of_05.csv -tests/hipscat_import/data/small_sky_parts/catalog_01_of_05.csv - diff --git a/tests/hipscat_import/data/indexed_files/csv_list_double_2_of_2.txt b/tests/hipscat_import/data/indexed_files/csv_list_double_2_of_2.txt deleted file mode 100644 index 352c08ea..00000000 --- a/tests/hipscat_import/data/indexed_files/csv_list_double_2_of_2.txt +++ /dev/null @@ -1,3 +0,0 @@ -tests/hipscat_import/data/small_sky_parts/catalog_02_of_05.csv -tests/hipscat_import/data/small_sky_parts/catalog_03_of_05.csv -tests/hipscat_import/data/small_sky_parts/catalog_04_of_05.csv \ No newline at end of file diff --git a/tests/hipscat_import/data/indexed_files/csv_list_single.txt b/tests/hipscat_import/data/indexed_files/csv_list_single.txt deleted file mode 100644 index 04817f83..00000000 --- a/tests/hipscat_import/data/indexed_files/csv_list_single.txt +++ /dev/null @@ -1,6 +0,0 @@ -tests/hipscat_import/data/small_sky_parts/catalog_00_of_05.csv -tests/hipscat_import/data/small_sky_parts/catalog_01_of_05.csv -tests/hipscat_import/data/small_sky_parts/catalog_02_of_05.csv -tests/hipscat_import/data/small_sky_parts/catalog_03_of_05.csv -tests/hipscat_import/data/small_sky_parts/catalog_04_of_05.csv - diff --git a/tests/hipscat_import/data/indexed_files/parquet_list_single.txt b/tests/hipscat_import/data/indexed_files/parquet_list_single.txt deleted file mode 100644 index 63e5b84f..00000000 --- a/tests/hipscat_import/data/indexed_files/parquet_list_single.txt +++ /dev/null @@ -1,5 +0,0 @@ -tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_0_0.parquet -tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_1_0.parquet -tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_2_0.parquet -tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_3_0.parquet -tests/hipscat_import/data/parquet_shards/order_0/dir_0/pixel_11/shard_4_0.parquet diff --git a/tests/hipscat_import/data/small_sky_object_catalog/point_map.fits b/tests/hipscat_import/data/small_sky_object_catalog/point_map.fits deleted file mode 100644 index 1971966f..00000000 Binary files a/tests/hipscat_import/data/small_sky_object_catalog/point_map.fits and /dev/null differ diff --git a/tests/hipscat_import/data/test_formats/hipscat_index.csv b/tests/hipscat_import/data/test_formats/hipscat_index.csv deleted file mode 100644 index 0b4f8a78..00000000 --- a/tests/hipscat_import/data/test_formats/hipscat_index.csv +++ /dev/null @@ -1,132 +0,0 @@ -id,_hipscat_index,magnitude,nobs -707,12749688880727326720,22.13496609,264 -792,12751184493818150912,6.487240283,395 -811,12753202806647685120,23.7801059,268 -723,12753202806647685121,22.86223173,426 -826,12770681119980912640,18.01813779,338 -750,12771980657148559360,24.78617356,364 -771,12776409575968473088,23.11024818,389 
-734,12782714789977653248,21.40031147,479 -738,12786706826733289472,13.78825467,457 -772,12786894563780329472,4.188779415,336 -776,12788339839317573632,12.40395764,212 -733,12797951905556856832,8.970635074,217 -804,12801026705158307840,2.651958506,451 -747,12818067795442925568,8.884322517,288 -739,12823504327528153088,4.29767576,332 -816,12842381331509805056,21.90158694,413 -703,12842473731565551616,24.08464986,247 -794,12855054043935932416,12.84586391,410 -735,12856781556059996160,11.25435057,386 -797,12859878138972209152,3.39664171,308 -815,12866984851890241536,8.873824597,347 -748,12882093266048122880,0.7007125911,221 -716,12886291525662670848,10.69933855,239 -807,12886577464536465408,21.07926395,466 -768,12887770713741590528,7.656387426,472 -729,12888117478487490560,20.04342103,474 -810,12888375204127965184,10.51822722,445 -718,12890425758039670784,19.06826935,322 -818,12897705201133158400,13.12773551,380 -766,12901304742075957248,9.016078942,281 -730,12904011555938500608,7.923479941,463 -758,12924400840801779712,19.96407461,202 -780,12924737222707511296,12.77423302,250 -775,12926803467124604928,4.732638481,432 -760,12927513300782022656,13.65187931,331 -795,12935235931912273920,25.40947285,364 -822,12946238438616596480,12.71703295,471 -736,12947523513744359424,11.43843929,444 -801,12949977409238597632,20.47232202,315 -830,12951015418364952576,15.73427632,417 -817,12957936896993918976,12.05402411,346 -787,12958541318065225728,0.173058202,318 -812,12980498864409673728,0.06359345768,208 -722,12985050869937471488,25.60136077,381 -731,13025270726448381952,6.948013224,310 -720,13031060802264629248,12.14407631,360 -823,13040468461170458624,13.46433211,487 -742,13055884976753475584,15.54236222,480 -719,13093160001097170944,12.6330677,435 -710,13094378277252890624,18.02127743,316 -726,13095317624672223232,5.502478521,462 -744,13097779065304121344,0.8044727764,209 -813,13100157308065808384,17.49730373,271 -757,13109184215138697216,6.925286532,483 -821,13114993892334239744,6.87906273,454 -762,13117165557772189696,13.02548232,345 -728,13122077940282032128,12.44919001,380 -781,13123208770404483072,12.0115697,417 -704,13130546552927944704,24.91128218,430 -751,13135578070553460736,24.91242751,372 -724,13158407025211736064,16.88580167,421 -808,13164283224702058496,24.24020062,490 -784,13186894729939255296,13.33239374,225 -732,13187453677775880192,13.31989657,258 -745,13189921792761790464,25.75526883,405 -786,13202401744484564992,14.17373488,371 -705,13203103043639312384,21.0010083,269 -779,13211086588563423232,1.907256896,492 -761,13235029212974284800,4.441992632,207 -828,13239388759557931008,1.459992705,354 -803,13250788433850269696,20.88998098,233 -788,13263647230914461696,16.89976725,334 -700,13272631885323829248,17.50204436,497 -793,13277499429092327424,11.03349796,373 -749,13283409463257071616,16.3051455,268 -805,13284984179453329408,18.38673631,351 -773,13293316792777703424,7.637464207,252 -774,13300970211545972736,16.00879042,469 -712,13316869903572008960,14.92672305,453 -759,13319655515505033216,9.060606183,393 -820,13325709382806142976,8.722036752,393 -789,13326118614579806208,17.74075663,292 -711,13335640766354030592,18.97132992,356 -802,13335856080517857280,22.33402335,295 -701,13341394068685455360,1.17742214,263 -727,13347311673342427136,19.59159969,353 -717,13348003826582421504,13.54006347,202 -753,13351146793404989440,8.367684293,285 -769,13358998609274601472,23.5473658,325 -725,13359333484913491968,8.571007577,432 -827,13362536511002640384,7.048240895,284 
-777,13364612928339181568,15.69992175,478 -764,13368388511275679744,2.729025506,371 -785,13369482380335644672,24.20554709,285 -709,13369514156621824000,19.52493845,473 -713,13374210622061805568,25.88972528,336 -800,13382429402164363264,22.99430092,428 -706,13384601479449411584,18.74524258,287 -755,13387360701694083072,20.52555018,377 -741,13387360701694083073,4.868075987,278 -714,13388334615593222144,12.67035091,334 -763,13389212170495983616,5.129770839,441 -708,13389509163101454336,6.418179256,477 -765,13392589952663945216,1.538183169,482 -740,13393588426222075904,3.147345677,340 -783,13425161974698737664,11.01365058,230 -790,13462800185222496256,0.8847339055,456 -809,13465233373970563072,25.57341722,253 -715,13467391906581315584,17.7427978,477 -782,13477206946360590336,14.58925323,231 -752,13488986123334057984,10.62019163,320 -746,13520476867982786560,24.33242981,297 -770,13521835979425447936,25.87566821,339 -756,13552942781667737600,16.94841816,342 -798,13553697461939208192,12.41856805,315 -778,13557123557418336256,19.34416403,297 -829,13557377060258709504,24.98975232,446 -819,13557816572940124160,18.2879184,367 -814,13560168899495854080,18.10171084,444 -721,13560933976658411520,5.911152016,423 -737,13561582046530240512,8.714697049,310 -799,13563711661973438464,3.390950561,267 -825,13564690156971098112,4.077103157,389 -796,13565852277582856192,21.03770104,418 -754,13588709332114997248,20.86431444,408 -806,13590818251897569280,13.50630832,308 -791,13591216801265483776,16.70345827,263 -824,13596001812279721984,11.11028334,243 -702,13598131468743213056,17.0456881,429 -767,13601023174257934336,12.99882524,476 -743,13696722494273093632,19.0979479,428 \ No newline at end of file diff --git a/tests/hipscat_import/data/test_formats/hipscat_index.parquet b/tests/hipscat_import/data/test_formats/hipscat_index.parquet deleted file mode 100644 index 44bdf663..00000000 Binary files a/tests/hipscat_import/data/test_formats/hipscat_index.parquet and /dev/null differ diff --git a/tests/hipscat_import/test_packaging.py b/tests/hipscat_import/test_packaging.py deleted file mode 100644 index 877c0ddf..00000000 --- a/tests/hipscat_import/test_packaging.py +++ /dev/null @@ -1,6 +0,0 @@ -import hipscat_import - - -def test_hipscat_import_version(): - """Check to see that we can get the hipscat-import version""" - assert hipscat_import.__version__ is not None
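The hunks above all follow one migration pattern: `hipscat`/`hipscat_import` imports become `hats`/`hats_import`, `read_from_hipscat` becomes `read_hats`, `compute_hipscat_id` and the `_hipscat_index` column become `compute_spatial_index` and `_healpix_29`, and leaf parquet files move under a `dataset/` subdirectory. A minimal sketch of the new-style calls these tests exercise, assuming a locally available small-sky-style catalog at the hypothetical `catalog_path` below (not a path introduced by this diff):

    from pathlib import Path

    import numpy as np
    from hats import pixel_math
    from hats.catalog import Catalog

    # Hypothetical location of an already-imported HATS catalog.
    catalog_path = Path("tests/hats_import/data/small_sky_object_catalog")

    # New-style catalog read (replaces Catalog.read_from_hipscat).
    catalog = Catalog.read_hats(catalog_path)
    assert catalog.on_disk
    pixels = catalog.get_healpix_pixels()

    # Spatial index values are now produced by compute_spatial_index and stored
    # in the _healpix_29 column (replacing compute_hipscat_id / _hipscat_index).
    index_values = pixel_math.compute_spatial_index(
        np.array([282.5]), np.array([-58.5])
    )

    # Leaf parquet files now live under a dataset/ subdirectory, e.g. for the
    # single small-sky pixel (order 0, pixel 11) used throughout these tests.
    leaf_file = catalog_path / "dataset" / "Norder=0" / "Dir=0" / "Npix=11.parquet"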