diff --git a/.gitignore b/.gitignore index 9c0b87d7b..ee4031b13 100644 --- a/.gitignore +++ b/.gitignore @@ -309,4 +309,4 @@ tests/example2.* # Client data paperqa/clients/client_data/retractions.csv -rag-qa-benchmarking/ \ No newline at end of file +rag-qa-benchmarking/ diff --git a/paperqa/agents/search.py b/paperqa/agents/search.py index 5504fa755..da60dd2b8 100644 --- a/paperqa/agents/search.py +++ b/paperqa/agents/search.py @@ -237,7 +237,7 @@ async def searcher(self) -> Searcher: index.reload() self._searcher = index.searcher() return self._searcher - + @property async def writer(self) -> IndexWriter: if not self._writer: @@ -277,7 +277,7 @@ async def filecheck(self, filename: str, body: str | None = None) -> bool: async def mark_failed_document(self, path: str | os.PathLike) -> None: (await self.index_files)[str(path)] = FAILED_DOCUMENT_ADD_ID self.changed = True - + async def release_lock(self) -> None: """Remove any stale lock files from the index metadata directory.""" index_meta_dir = pathlib.Path(str(await self.index_filename)) @@ -286,9 +286,7 @@ async def release_lock(self) -> None: lock_file.unlink() logger.info(f"Removed stale lock file: {lock_file}") except Exception as ex: - logger.exception( - f"Could not remove stale lock file: {lock_file}: {ex}" - ) + logger.exception(f"Could not remove stale lock file: {lock_file}: {ex}") async def add_document( self, @@ -345,7 +343,7 @@ async def _add_document() -> None: f" within {lock_acquisition_max_retries} attempts." ) raise - + async def commit(self) -> None: """Commit all pending changes to the index.""" if self._writer: @@ -354,7 +352,6 @@ async def commit(self) -> None: self._searcher = None self._writer = None - @staticmethod @retry( stop=stop_after_attempt(1000), @@ -484,6 +481,7 @@ async def maybe_get_manifest( FAILED_DOCUMENT_ADD_ID = "ERROR" + def get_manifest_kwargs( manifest: dict[str, Any], manifest_fallback_location: str, file_location: str ) -> dict[str, Any]: @@ -495,7 +493,10 @@ def get_manifest_kwargs( return manifest_entry.model_dump() return {} + processed = 0 + + async def process_file( rel_file_path: anyio.Path, search_index: SearchIndex, @@ -505,7 +506,7 @@ async def process_file( progress_bar_update: Callable[[], Any] | None = None, ) -> None: global processed - + abs_file_path = ( pathlib.Path(settings.agent.index.paper_directory).absolute() / rel_file_path ) @@ -521,7 +522,9 @@ async def process_file( if not await search_index.filecheck(filename=file_location): logger.info(f"New file to index: {file_location}...") - manifest_kwargs = get_manifest_kwargs(manifest, manifest_fallback_location, file_location) + manifest_kwargs = get_manifest_kwargs( + manifest, manifest_fallback_location, file_location + ) tmp_docs = Docs() try: @@ -560,13 +563,13 @@ async def process_file( }, document=tmp_docs, ) - + processed += 1 if processed == settings.agent.index.concurrency: await search_index.save_index() logger.info(f"Saved index after processing {processed} files.") processed = 0 - + logger.info(f"Complete ({title}).") # Update progress bar for either a new or previously indexed file diff --git a/paperqa/readers.py b/paperqa/readers.py index 259f2ea50..85a2004a3 100644 --- a/paperqa/readers.py +++ b/paperqa/readers.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio import os from math import ceil from pathlib import Path @@ -20,8 +21,6 @@ from paperqa.utils import ImpossibleParsingError from paperqa.version import __version__ as pqa_version -import asyncio - def parse_pdf_to_pages( path: str | os.PathLike, page_size_limit: int | None = None diff --git a/tests/test_agents.py b/tests/test_agents.py index 0f92c3d4b..19c9ccb98 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -161,7 +161,7 @@ async def crashing_aadd(*args, **kwargs) -> str | None: ) as mock_aadd, ): index = await get_directory_index(settings=agent_test_settings) - + assert len(await index.index_files) == num_source_files assert ( mock_aadd.await_count != num_source_files