feat: Add continuous mode with --sleep-seconds (#192)

janw authored Oct 27, 2024
1 parent 43b2c95 · commit 02ff00e

Showing 18 changed files with 399 additions and 168 deletions.
19 changes: 19 additions & 0 deletions README.md
@@ -72,6 +72,25 @@ Feeds can also be "fetched" from a local file:
podcast-archiver -f file:/Users/janw/downloaded_feed.xml
```

#### Continuous mode

When the `--sleep-seconds` option is set to a non-zero value, Podcast Archiver operates in continuous mode. After successfully populating the archive, it does not terminate; instead it sleeps for the given number of seconds, then refreshes the feeds again and downloads any episodes published in the meantime.

If no new episodes have been published, no download attempts are made, and the archiver goes back to sleep. This mode of operation is well suited to a containerized setup, for example using [docker compose](https://docs.docker.com/compose/install/):

```yaml
services:
  podcast-archiver:
    restart: always
    image: ghcr.io/janw/podcast-archiver
    volumes:
      - ./archive:/archive
    command:
      - --sleep-seconds=3600 # sleep for 1 hour between updates
      - --feed=https://feeds.feedburner.com/TheAnthropoceneReviewed
      - --feed=https://feeds.megaphone.fm/heavyweight-spot
```
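
With the compose file above in place, `docker compose up -d` starts the archiver in the background; `--sleep-seconds=3600` makes it refresh the feeds once per hour, and `restart: always` additionally brings the container back up after a crash or reboot.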
### Changing the filename format

Podcast Archiver has a `--filename-template` option that allows you to change the naming scheme of the archive. The default value for `--filename-template` is shown in `podcast-archiver --help`, along with all the available variables. The basic ones are:
1 change: 1 addition & 0 deletions cspell.config.yaml
@@ -44,6 +44,7 @@ words:
  - PYTHONUNBUFFERED
  - pyyaml
  - rprint
  - signum
  - subdirs
  - tini
  - tmpl
17 changes: 17 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,17 @@
services:
  podcast-archiver:
    restart: always
    image: ghcr.io/janw/podcast-archiver:v1
    build:
      context: .
      dockerfile: Dockerfile
      cache_from:
        - ghcr.io/janw/podcast-archiver:edge
        - ghcr.io/janw/podcast-archiver:latest
    volumes:
      - ./archive:/archive
    command:
      - --sleep-seconds=3600
      - --ignore-database
      - --feed=https://feeds.feedburner.com/TheAnthropoceneReviewed
      - --feed=https://feeds.megaphone.fm/heavyweight-spot
14 changes: 11 additions & 3 deletions podcast_archiver/base.py
@@ -1,7 +1,9 @@
from __future__ import annotations

import signal
import sys
import xml.etree.ElementTree as etree
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

from podcast_archiver.logging import logger, rprint
from podcast_archiver.processor import FeedProcessor
@@ -31,9 +33,15 @@ def __init__(self, settings: Settings):
            self.add_from_opml(opml)

    def register_cleanup(self, ctx: click.RichContext) -> None:
        @ctx.call_on_close
        def _cleanup() -> None:
        def _cleanup(signum: int, *args: Any) -> None:
            logger.debug("Signal %s received", signum)
            rprint("[error]Terminating.[/]")
            self.processor.shutdown()
            ctx.close()
            sys.exit(0)

        signal.signal(signal.SIGINT, _cleanup)
        signal.signal(signal.SIGTERM, _cleanup)

    def add_feed(self, feed: Path | str) -> None:
        new_feeds = [feed] if isinstance(feed, str) else feed.read_text().strip().splitlines()
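The cleanup hook moves from click's `ctx.call_on_close` to OS-level signal handlers, which also catches the SIGTERM that `docker stop` sends, not just Ctrl+C. A minimal, standard-library-only sketch of the pattern (the names here are illustrative, not the project's):

```python
import signal
import sys
from types import FrameType


def _cleanup(signum: int, frame: FrameType | None) -> None:
    # Handlers registered via signal.signal receive (signal number, stack frame).
    print(f"Signal {signum} received, terminating.")
    sys.exit(0)  # raises SystemExit so `with` blocks and finally clauses unwind


signal.signal(signal.SIGINT, _cleanup)   # Ctrl+C in an interactive session
signal.signal(signal.SIGTERM, _cleanup)  # default signal sent by `docker stop`

signal.pause()  # block until a signal arrives (POSIX only)
```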
20 changes: 16 additions & 4 deletions podcast_archiver/cli.py
@@ -3,18 +3,18 @@
import os
import pathlib
import stat
import time
from os import getenv
from typing import TYPE_CHECKING, Any

import rich_click as click
from rich import get_console

from podcast_archiver import __version__ as version
from podcast_archiver import constants
from podcast_archiver.base import PodcastArchiver
from podcast_archiver.config import Settings, in_ci
from podcast_archiver.exceptions import InvalidSettings
from podcast_archiver.logging import configure_logging
from podcast_archiver.logging import configure_logging, rprint

if TYPE_CHECKING:
    from click.shell_completion import CompletionItem
@@ -49,6 +49,7 @@
"--update",
"--max-episodes",
"--ignore-database",
"--sleep",
],
},
]
@@ -215,6 +216,7 @@ def generate_default_config(ctx: click.Context, param: click.Parameter, value: b
"-v",
"--verbose",
count=True,
metavar="",
show_envvar=True,
help=Settings.model_fields["verbose"].description,
)
@@ -281,10 +283,16 @@ def generate_default_config(ctx: click.Context, param: click.Parameter, value: b
    show_envvar=True,
    help=Settings.model_fields["ignore_database"].description,
)
@click.option(
    "--sleep-seconds",
    type=int,
    default=0,
    show_envvar=True,
    help=Settings.model_fields["sleep_seconds"].description,
)
@click.pass_context
def main(ctx: click.RichContext, /, **kwargs: Any) -> int:
    get_console().quiet = kwargs["quiet"]
    configure_logging(kwargs["verbose"])
    configure_logging(kwargs["verbose"], kwargs["quiet"])
    try:
        settings = Settings.load_from_dict(kwargs)

@@ -296,6 +304,10 @@ def main(ctx: click.RichContext, /, **kwargs: Any) -> int:
        pa = PodcastArchiver(settings=settings)
        pa.register_cleanup(ctx)
        pa.run()
        while settings.sleep_seconds > 0:
            rprint(f"Sleeping for {settings.sleep_seconds} seconds.")
            time.sleep(settings.sleep_seconds)
            pa.run()
    except InvalidSettings as exc:
        raise click.BadParameter(f"Invalid settings: {exc}") from exc
    except KeyboardInterrupt as exc:  # pragma: no cover
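One detail worth noting about the loop above: a SIGINT or SIGTERM arriving during `time.sleep()` interrupts the sleep and runs the handler registered in `register_cleanup`, which exits via `sys.exit(0)`, so the infinite loop needs no break condition of its own.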
14 changes: 13 additions & 1 deletion podcast_archiver/config.py
@@ -87,7 +87,11 @@ class Settings(BaseModel):

    verbose: int = Field(
        default=0,
        description="Increase the level of verbosity while downloading.",
        description=(
            "Increase the level of verbosity while downloading. Can be passed multiple times. Increased verbosity and "
            "non-interactive execution (in a cronjob, docker compose, etc.) will disable progress bars. "
            "Non-interactive execution also always raises the verbosity unless --quiet is passed."
        ),
    )

    slugify_paths: bool = Field(
@@ -136,6 +140,14 @@ class Settings(BaseModel):
        ),
    )

    sleep_seconds: int = Field(
        default=0,
        description=(
            f"Run {constants.PROG_NAME} continuously. Set to a non-zero number of seconds to sleep after all available "
            "episodes have been downloaded. Otherwise the application exits after all downloads have been completed."
        ),
    )

    config: FilePath | None = Field(
        default=None,
        exclude=True,
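As the cli.py hunk earlier shows, the new option's help text is pulled straight from the model via `Settings.model_fields["sleep_seconds"].description`, keeping the CLI and config-file documentation in one place. A minimal sketch of that mechanism, assuming pydantic v2 (the field and description here are stand-ins):

```python
from pydantic import BaseModel, Field


class Settings(BaseModel):
    sleep_seconds: int = Field(
        default=0,
        description="Seconds to sleep between feed refreshes; 0 disables continuous mode.",
    )


# pydantic v2 exposes per-field metadata on the class itself, no instance needed:
print(Settings.model_fields["sleep_seconds"].description)
```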
2 changes: 2 additions & 0 deletions podcast_archiver/constants.py
@@ -15,6 +15,8 @@

MAX_TITLE_LENGTH = 96


DEFAULT_DATETIME_FORMAT = "%Y-%m-%d"
DEFAULT_ARCHIVE_DIRECTORY = pathlib.Path(".")
DEFAULT_FILENAME_TEMPLATE = "{show.title}/{episode.published_time:%Y-%m-%d} - {episode.title}.{ext}"
DEFAULT_CONCURRENCY = 4
97 changes: 37 additions & 60 deletions podcast_archiver/download.py
@@ -1,15 +1,13 @@
from __future__ import annotations

from contextlib import nullcontext
from contextlib import contextmanager
from threading import Event
from typing import IO, TYPE_CHECKING, NoReturn

from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
from typing import IO, TYPE_CHECKING, Generator

from podcast_archiver import constants
from podcast_archiver.enums import DownloadResult
from podcast_archiver.logging import logger
from podcast_archiver.exceptions import NotCompleted
from podcast_archiver.logging import logger, wrapped_tqdm
from podcast_archiver.session import session
from podcast_archiver.types import EpisodeResult
from podcast_archiver.utils import atomic_write
@@ -28,38 +26,31 @@ class DownloadJob:
    target: Path
    stop_event: Event

    _debug_partial: bool
    _max_download_bytes: int | None = None
    _write_info_json: bool
    _no_progress: bool

    def __init__(
        self,
        episode: Episode,
        *,
        target: Path,
        debug_partial: bool = False,
        max_download_bytes: int | None = None,
        write_info_json: bool = False,
        no_progress: bool = False,
        stop_event: Event | None = None,
    ) -> None:
        self.episode = episode
        self.target = target
        self._debug_partial = debug_partial
        self._max_download_bytes = max_download_bytes
        self._write_info_json = write_info_json
        self._no_progress = no_progress
        self.stop_event = stop_event or Event()

    def __repr__(self) -> str:
        return f"EpisodeDownload({self})"

    def __str__(self) -> str:
        return str(self.episode)

    def __call__(self) -> EpisodeResult:
        try:
            return self.run()
        except NotCompleted:
            return EpisodeResult(self.episode, DownloadResult.ABORTED)
        except Exception as exc:
            logger.error(f"Download failed: {exc}")
            logger.error("Download failed: %s; %s", self.episode, exc)
            logger.debug("Exception while downloading", exc_info=exc)
            return EpisodeResult(self.episode, DownloadResult.FAILED)

@@ -68,58 +59,44 @@ def run(self) -> EpisodeResult:
            return EpisodeResult(self.episode, DownloadResult.ALREADY_EXISTS)

        self.target.parent.mkdir(parents=True, exist_ok=True)
        self.write_info_json()

        response = session.get(
            self.episode.enclosure.href,
            stream=True,
            allow_redirects=True,
        )
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", "0"))
        with (
            logging_redirect_tqdm() if not self._no_progress else nullcontext(),
            tqdm(
                desc=f"{self.episode.title} ({self.episode.published_time:%Y-%m-%d})",
                total=total_size,
                unit_scale=True,
                unit="B",
                disable=self._no_progress,
            ) as progresser,
        ):
            with atomic_write(self.target, mode="wb") as fp:
                receive_complete = self.receive_data(fp, response, progresser=progresser)

            if not receive_complete:
                self.target.unlink(missing_ok=True)
                return EpisodeResult(self.episode, DownloadResult.ABORTED)
        logger.info("Downloading: %s", self.episode)
        response = session.get_and_raise(self.episode.enclosure.href, stream=True)
        with self.write_info_json(), atomic_write(self.target, mode="wb") as fp:
            self.receive_data(fp, response)

        logger.info("Completed download of %s", self.target)
        logger.info("Completed: %s", self.episode)
        return EpisodeResult(self.episode, DownloadResult.COMPLETED_SUCCESSFULLY)

    @property
    def infojsonfile(self) -> Path:
        return self.target.with_suffix(".info.json")

    def receive_data(self, fp: IO[str], response: Response, progresser: tqdm[NoReturn]) -> bool:
    def receive_data(self, fp: IO[bytes], response: Response) -> None:
        total_size = int(response.headers.get("content-length", "0"))
        total_written = 0
        for chunk in response.iter_content(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
            written = fp.write(chunk)
            total_written += written
            progresser.update(written)

            if self._debug_partial and total_written >= constants.DEBUG_PARTIAL_SIZE:
                logger.debug("Partial download completed.")
                return True
            if self.stop_event.is_set():
                logger.debug("Stop event is set, bailing.")
                return False
        max_bytes = self._max_download_bytes
        for chunk in wrapped_tqdm(
            response.iter_content(chunk_size=constants.DOWNLOAD_CHUNK_SIZE),
            desc=str(self.episode),
            total=total_size,
        ):
            total_written += fp.write(chunk)

        return True
            if max_bytes and total_written >= max_bytes:
                fp.truncate(max_bytes)
                logger.debug("Partial download of first %s bytes completed.", max_bytes)
                return

            if self.stop_event.is_set():
                logger.debug("Stop event is set, bailing on %s.", self.episode)
                raise NotCompleted

    def write_info_json(self) -> None:
    @contextmanager
    def write_info_json(self) -> Generator[None, None, None]:
        if not self._write_info_json:
            yield
            return
        logger.info("Writing episode metadata to %s", self.infojsonfile.name)
        with atomic_write(self.infojsonfile) as fp:
            fp.write(self.episode.model_dump_json(indent=2) + "\n")
        yield
        logger.debug("Wrote episode metadata to %s", self.infojsonfile.name)
6 changes: 3 additions & 3 deletions podcast_archiver/enums.py
@@ -7,9 +7,9 @@ def __str__(self) -> str:


class QueueCompletionType(StrEnum):
    COMPLETED = "Archived all episodes."
    FOUND_EXISTING = "Archive is up to date."
    MAX_EPISODES = "Maximum episode count reached."
    COMPLETED = "Archived all episodes"
    FOUND_EXISTING = "Archive is up to date"
    MAX_EPISODES = "Maximum episode count reached"


class DownloadResult(StrEnum):
22 changes: 20 additions & 2 deletions podcast_archiver/exceptions.py
@@ -1,6 +1,11 @@
from typing import Any
from __future__ import annotations

import pydantic_core
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    import pydantic_core

    from podcast_archiver.models import FeedInfo


class PodcastArchiverException(Exception):
@@ -27,3 +32,16 @@ def __str__(self) -> str:

class MissingDownloadUrl(ValueError):
    pass


class NotCompleted(RuntimeError):
    pass


class NotModified(PodcastArchiverException):
    info: FeedInfo
    last_modified: str | None = None

    def __init__(self, info: FeedInfo, *args: object) -> None:
        super().__init__(*args)
        self.info = info