Skip to content

Commit

Permalink
adjustments per initial testing with gs:// and .flac handling
Browse files Browse the repository at this point in the history
  • Loading branch information
carueda committed Jan 23, 2024
1 parent 97e89f6 commit 798bc67
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 11 deletions.
20 changes: 20 additions & 0 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,26 @@ main-mb05 *more_args="":
# --max-segments=5 \
# --output-dir=/Volumes/PAM_Analysis/pypam-space/test_output \

# Exercise program with `gs://` URIs
main-google date='20200101' *more_args="--max-segments=5":
#!/usr/bin/env bash
WS=noaa-passive-bioacoustic_nrs_11_2019-2021
mkdir -p $WS/DOWNLOADS
mkdir -p $WS/OUTPUT
PYTHONPATH=. EXCLUDE_LOG_TIME=yes \
python src/main.py \
--date={{date}} \
--gs \
--json-base-dir=$WS \
--voltage-multiplier=1 \
--sensitivity-flat-value=1 \
--output-dir=$WS/OUTPUT \
--output-prefix=NRS_ \
--download-dir=$WS/DOWNLOADS \
--retain-downloaded-files \
{{more_args}}
# --assume-downloaded-files \
# Basic test for cloud processing
main-cloud-basic-test max_segments="1" date="20220902":
#!/usr/bin/env bash
Expand Down
14 changes: 11 additions & 3 deletions src/file_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ def __init__(
audio_base_dir: Optional[str],
audio_path_map_prefix: str,
audio_path_prefix: str,
s3_client: Optional[BaseClient],
download_dir: Optional[str],
assume_downloaded_files: bool,
s3_client: Optional[BaseClient] = None,
gs_client: Optional[GsClient] = None,
):
self.logger = logger

Expand All @@ -47,6 +48,7 @@ def __init__(
self.audio_base_dir = audio_base_dir
self.audio_path_prefix = audio_path_prefix
self.s3_client = s3_client
self.gs_client = gs_client
self.download_dir: str = download_dir if download_dir else "."
self.assume_downloaded_files = assume_downloaded_files

Expand Down Expand Up @@ -83,6 +85,7 @@ def _get_sound_filename(self) -> Optional[str]:
self.download_dir,
self.assume_downloaded_files,
self.s3_client,
self.gs_client,
)

# otherwise assuming local file, so we only inspect the `path` attribute:
Expand All @@ -99,7 +102,9 @@ def remove_downloaded_file(self):
if not pathlib.Path(self.sound_filename).exists():
return

if self.s3_client is None or self.parsed_uri.scheme != "s3":
if (
self.s3_client is None and self.gs_client is None
) or self.parsed_uri.scheme not in ("s3", "gs"):
self.logger.debug(f"No file download involved for {self.uri=}")
return

Expand Down Expand Up @@ -294,6 +299,7 @@ def get_local_filename(self, uri: Optional[str]) -> Optional[str]:
self.download_dir,
self.assume_downloaded_files,
self.s3_client,
self.gs_client,
)

return parsed_uri.path
Expand Down Expand Up @@ -337,6 +343,7 @@ def _get_json_s3(self, parsed_uri: ParseResult) -> Optional[str]:
self.download_dir,
self.assume_downloaded_files,
self.s3_client,
self.gs_client,
)
if local_filename is None:
return None
Expand Down Expand Up @@ -467,9 +474,10 @@ def _get_sound_status(self, uri: str) -> SoundStatus:
audio_base_dir=self.audio_base_dir,
audio_path_map_prefix=self.audio_path_map_prefix,
audio_path_prefix=self.audio_path_prefix,
s3_client=self.s3_client,
download_dir=self.download_dir,
assume_downloaded_files=self.assume_downloaded_files,
s3_client=self.s3_client,
gs_client=self.gs_client,
)
self.sound_cache[uri] = ss
else:
Expand Down
22 changes: 15 additions & 7 deletions src/json_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ def get_intersecting_entries(
Gets the list of intersecting entries for the UTC "start minute"
given by (year, month, day, at_hour, at_minute).
:param logger:
The logger to be used
:param json_entries:
JSON entries for the day
:param year:
Expand All @@ -90,6 +92,12 @@ def get_intersecting_entries(
:return:
The list of intersecting entries
"""
# for logging purposes:
time_spec = (
f"year={year} month={month} day={day} at_hour={at_hour} at_minute={at_minute}"
)
logger.debug(f"get_intersecting_entries: {time_spec} {len(json_entries)=}")

# the requested start minute as datetime:
dt = datetime(year, month, day, at_hour, at_minute, tzinfo=timezone.utc)
# the start of the requested start minute in seconds:
Expand All @@ -116,14 +124,14 @@ def get_intersecting_entries(
)
tot_duration_secs += duration_secs

# for logging purposes:
time_spec = (
f"year={year} month={month} day={day} at_hour={at_hour} at_minute={at_minute}"
)

if logger.is_enabled_for(logging.DEBUG):
warning = 0 == len(intersecting_entries)
if warning or logger.is_enabled_for(logging.DEBUG):
uris = [i.entry.uri for i in intersecting_entries]
uris_str = "\n ".join([f"[{e}] {uri}" for e, uri in enumerate(uris)])
logger.debug(f"{time_spec}: intersection uris({len(uris)}):\n {uris_str}")
msg = f"{time_spec}: intersection uris({len(uris)}):\n {uris_str}"
if warning:
logger.warn(msg)
else:
logger.debug(msg)

return intersecting_entries
2 changes: 1 addition & 1 deletion src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def main(opts):
s3_client = boto3.client("s3", **kwargs)

gs_client = None
if opts.s3:
if opts.gs:
# pylint: disable=import-outside-toplevel
from google.cloud.storage import Client as GsClient

Expand Down

0 comments on commit 798bc67

Please sign in to comment.