@staticmethod
def choose_most_complete(results: "List[Metadata]") -> "Metadata | None":
    """Return the most complete result from ``results``.

    Results are ranked by the number of media items first and, when that
    is equal, by the number of metadata entries. If several results are
    equally complete, the earliest one in ``results`` is returned.

    Args:
        results: candidate results; any iterable is accepted.

    Returns:
        The most complete result, or ``None`` when ``results`` is empty.
    """
    # max() keeps the first maximal element, so ties resolve to the
    # earliest result; default=None covers the empty case without a
    # separate length check.
    return max(
        results,
        key=lambda r: (len(r.media), len(r.metadata)),
        default=None,
    )
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
    """Query the API for previously archived instances of this item's URL.

    Up to 15 previous results are requested and the one picked by
    ``Metadata.choose_most_complete`` is returned.

    Args:
        item: the item whose URL (``item.get_url()``) is looked up.

    Returns:
        The chosen ``Metadata`` when the API returns at least one previous
        result; ``False`` when it returns none or responds with a non-200
        status; ``None`` (no request is made) when ``self.allow_rearchive``
        is falsy.
    """
    if not self.allow_rearchive: return

    params = {"url": item.get_url(), "limit": 15}
    headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
    # Join URL segments with "/" explicitly: os.path.join uses the OS path
    # separator, which produces a backslash on Windows.
    search_url = f"{self.api_endpoint.rstrip('/')}/tasks/search-url"
    response = requests.get(search_url, params=params, headers=headers)

    if response.status_code != 200:
        # Log the raw body: an error response is not guaranteed to be JSON,
        # and calling response.json() here could raise while logging.
        logger.error(f"AA API FAIL ({response.status_code}): {response.text}")
        return False

    previous = response.json()  # parse the body once and reuse it
    if not previous:
        return False

    logger.success(f"API returned {len(previous)} previously archived instance(s)")
    fetched_metadata = [Metadata.from_dict(r["result"]) for r in previous]
    return Metadata.choose_most_complete(fetched_metadata)