Skip to content

Commit

Permalink
feat(bulk-ingest): Add --ignored Flag to Exclude Specific Files and Directories During Ingestion (zylon-ai#1432)
Browse files: browse the repository at this point in the history
  • Loading branch information
smbrine authored Feb 7, 2024
1 parent 24fae66 commit b178b51
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 7 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,6 @@
.venv
.env
venv

settings-me.yaml

Expand Down
17 changes: 17 additions & 0 deletions Makefile
Original file line number | Diff line number | Diff line change
Expand Up @@ -56,3 +56,20 @@ wipe:

setup:
poetry run python scripts/setup

list:
@echo "Available commands:"
@echo " test : Run tests using pytest"
@echo " test-coverage : Run tests with coverage report"
@echo " black : Check code format with black"
@echo " ruff : Check code with ruff"
@echo " format : Format code with black and ruff"
@echo " mypy : Run mypy for type checking"
@echo " check : Run format and mypy commands"
@echo " run : Run the application"
@echo " dev-windows : Run the application in development mode on Windows"
@echo " dev : Run the application in development mode"
@echo " api-docs : Generate API documentation"
@echo " ingest : Ingest data using specified script"
@echo " wipe : Wipe data using specified script"
@echo " setup : Setup the application"
29 changes: 22 additions & 7 deletions scripts/ingest_folder.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,20 +20,20 @@ def __init__(self, ingest_service: IngestService) -> None:

self._files_under_root_folder: list[Path] = list()

def _find_all_files_in_folder(self, root_path: Path) -> None:
def _find_all_files_in_folder(self, root_path: Path, ignored: list[str]) -> None:
"""Search all files under the root folder recursively.
Count them at the same time
"""
for file_path in root_path.iterdir():
if file_path.is_file():
if file_path.is_file() and file_path.name not in ignored:
self.total_documents += 1
self._files_under_root_folder.append(file_path)
elif file_path.is_dir():
self._find_all_files_in_folder(file_path)
elif file_path.is_dir() and file_path.name not in ignored:
self._find_all_files_in_folder(file_path, ignored)

def ingest_folder(self, folder_path: Path) -> None:
def ingest_folder(self, folder_path: Path, ignored: list[str]) -> None:
# Count total documents before ingestion
self._find_all_files_in_folder(folder_path)
self._find_all_files_in_folder(folder_path, ignored)
self._ingest_all(self._files_under_root_folder)

def _ingest_all(self, files_to_ingest: list[Path]) -> None:
Expand Down Expand Up @@ -64,12 +64,19 @@ def _do_ingest_one(self, changed_path: Path) -> None:
action=argparse.BooleanOptionalAction,
default=False,
)
parser.add_argument(
"--ignored",
nargs="*",
help="List of files/directories to ignore",
default=[],
)
parser.add_argument(
"--log-file",
help="Optional path to a log file. If provided, logs will be written to this file.",
type=str,
default=None,
)

args = parser.parse_args()

# Set up logging to a file if a path is provided
Expand All @@ -91,9 +98,17 @@ def _do_ingest_one(self, changed_path: Path) -> None:

ingest_service = global_injector.get(IngestService)
worker = LocalIngestWorker(ingest_service)
worker.ingest_folder(root_path)
worker.ingest_folder(root_path, args.ignored)

if args.ignored:
logger.info(f"Skipping following files and directories: {args.ignored}")

if args.watch:
logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
directories_to_watch = [
dir
for dir in root_path.iterdir()
if dir.is_dir() and dir.name not in args.ignored
]
watcher = IngestWatcher(args.folder, worker.ingest_on_watch)
watcher.start()

0 comments on commit b178b51

Please sign in to comment.