Skip to content

Commit

Permalink
Pre-discover all leaf files for dataset creation. (#501)
Browse files — browse the repository at this point in the history
  • Loading branch information
delucchi-cmu authored Mar 4, 2025
1 parent 98205be commit f00cd7a
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 4 deletions.
2 changes: 1 addition & 1 deletion src/hats_import/soap/run_soap.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Create pixel-to-pixel association between object and source catalogs.
Methods in this file set up a dask pipeline using futures.
The actual logic of the map reduce is in the `map_reduce.py` file.
"""

Expand Down
11 changes: 10 additions & 1 deletion src/hats_import/verification/run_verification.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from dataclasses import dataclass, field
from time import perf_counter
from typing import Literal
from urllib.parse import unquote

import hats
import hats.io.paths
Expand Down Expand Up @@ -117,12 +118,20 @@ def from_args(cls, args: VerificationArguments) -> "Verifier":
-------
Verifier
"""

args.output_path.mkdir(exist_ok=True, parents=True)

if args.verbose:
print("Loading dataset and schema.")
parquet_fs = args.input_catalog_path.fs
files_ds = pds.dataset(args.input_dataset_path.path, filesystem=parquet_fs)

## Fetch all sub-URLs that could contain hats leaf files.
all_files = []
for child in args.input_dataset_path.rglob("Norder*/**/*"):
if not child.is_dir():
all_files.append(unquote(child.path))

files_ds = pds.dataset(all_files, filesystem=parquet_fs)
metadata_ds = pds.parquet_dataset(
hats.io.paths.get_parquet_metadata_pointer(args.input_catalog_path), filesystem=parquet_fs
)
Expand Down
2 changes: 1 addition & 1 deletion tests/hats_import/catalog/test_run_round_trip.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Test end-to-end execution of pipeline with different formats and configurations.
Please add a brief description in the docstring of the features or specific
regression the test case is exercising.
"""

Expand Down
2 changes: 1 addition & 1 deletion tests/hats_import/margin_cache/test_margin_round_trip.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Tests that create a new primary object catalog, and immediately
"""Tests that create a new primary object catalog, and immediately
create a margin cache based on the primary catalog."""

import os
Expand Down

0 comments on commit f00cd7a

Please sign in to comment.