diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 3b18a33..cea110c 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -8,6 +8,12 @@ env:
   REGISTRY: ghcr.io
   IMAGE_NAME: ${{ github.repository }}
 
+defaults:
+  run:
+    # GitHub Actions run without a TTY device. This is a workaround to get one,
+    # based on https://github.com/actions/runner/issues/241#issuecomment-2019042651
+    shell: 'script --return --quiet --log-out /dev/null --command "bash -e {0}"'
+
 jobs:
   build-and-push-image:
     runs-on: ubuntu-latest
@@ -20,6 +26,14 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
 
+      - name: Run tests
+        run: |
+          docker run \
+            -v ./src:/app \
+            -v ./pdf:/app/pdf \
+            $(docker build -q ./src) \
+            bash src/test/example.sh
+
       - name: Log in to the Container registry
         uses: docker/login-action@v3.1.0
         with:
diff --git a/src/main.py b/src/main.py
index fbc7a69..c1f93a2 100644
--- a/src/main.py
+++ b/src/main.py
@@ -8,7 +8,6 @@ from pathlib import Path
 
 
 import pymupdf
-from joblib import Parallel, delayed
 from natsort import natsorted, ns
 from PIL import Image
 
@@ -34,7 +33,7 @@ def predict(base: Path, input_file: Path, args: list[str]) -> None:
             [
                 "bash",
                 "-c",
-                f"ocrmypdf --jobs 1 {' '.join(args)} {input_file} {output_file}",
+                f"ocrmypdf {' '.join(args)} {input_file} {output_file}",
             ],
             check=True,
         )
@@ -45,50 +44,77 @@ def predict(base: Path, input_file: Path, args: list[str]) -> None:
         pass
 
 
+def cleanup(root: str, files: list[str]) -> None:
+    """
+    Removes empty directory
+
+    Args:
+        root (str): The root directory
+        files (list[str]): The list of files
+    """
+    if not files:
+        try:
+            os.rmdir(root)
+        except Exception:
+            pass
+
+
+def merge(base: Path, root: str, files: list[str]) -> None:
+    """
+    Merges the PDFs in the list
+
+    Args:
+        base (Path): The base directory
+        root (str): The root directory
+        files (list[str]): The list of files
+    """
+    proot = Path(root)
+    if proot == base / "done":
+        return
+
+    pdf_list = [
+        pymupdf.open(proot / file) for file in files if file.lower().endswith(".pdf")
+    ]
+    if not pdf_list:
+        return
+
+    merged = pymupdf.open()
+    for pdf in natsorted(pdf_list, key=lambda x: x.name, alg=ns.IGNORECASE):
+        merged.insert_pdf(pdf)
+
+    merged.save(Path(root + ".pdf"), garbage=4, deflate=True)
+    merged.close()
+
+    for pdf in pdf_list:
+        pdf.close()
+
+
 if __name__ == "__main__":
     pdfs = Path(sys.argv[1] if len(sys.argv) > 1 else ".")
     pdfs.mkdir(exist_ok=True, parents=True)
     (pdfs / "todo").mkdir(exist_ok=True, parents=True)
     (pdfs / "done").mkdir(exist_ok=True, parents=True)
 
-    Parallel(n_jobs=-1)(
-        delayed(predict)(
-            pdfs,
-            Path(root) / file,
-            sys.argv[2:] if len(sys.argv) > 2 else ["--rotate-pages", "--deskew", "--skip-text", "--invalidate-digital-signatures", "--clean"],
-        )
-        for root, _, files in os.walk(pdfs / "todo")
-        for file in files
-    )
+    for root, _, files in os.walk(pdfs / "todo"):
+        for file in files:
+            predict(
+                pdfs,
+                Path(root) / file,
+                (
+                    sys.argv[2:]
+                    if len(sys.argv) > 2
+                    else [
+                        "--rotate-pages",
+                        "--deskew",
+                        "--skip-text",
+                        "--invalidate-digital-signatures",
+                        "--clean",
+                    ]
+                ),
+            )
 
-    # Remove empty directories
     for root, _, files in os.walk(pdfs / "todo"):
-        if not files:
-            try:
-                os.rmdir(root)
-            except Exception:
-                pass
+        cleanup(root, files)
 
-    # Merge PDFs
     for root, _, files in os.walk(pdfs / "done"):
-        proot = Path(root)
-        if proot == pdfs / "done":
-            continue
-
-        pdf_list = [
-            pymupdf.open(proot / file)
-            for file in files
-            if file.lower().endswith(".pdf")
-        ]
-        if not pdf_list:
-            continue
-
-        merged = pymupdf.open()
-        for pdf in natsorted(pdf_list, key=lambda x: x.name, alg=ns.IGNORECASE):
-            merged.insert_pdf(pdf)
-
-        merged.save(Path(root + ".pdf"), garbage=4, deflate=True)
-        merged.close()
-
-        for pdf in pdf_list:
-            pdf.close()
+        merge(pdfs, root, files)
diff --git a/src/predict.sh b/src/predict.sh
index a4c1026..16e1521 100644
--- a/src/predict.sh
+++ b/src/predict.sh
@@ -13,7 +13,6 @@ if ! apt_install "$langs"; then
 fi
 
 [ -d venv ] || python3 -m venv venv
-export OMP_THREAD_LIMIT=1
 
 if [[ -e venv/bin/pip3 ]]; then
     source venv/bin/activate
diff --git a/src/test/example.sh b/src/test/example.sh
new file mode 100644
index 0000000..ea803b2
--- /dev/null
+++ b/src/test/example.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+set -e
+
+black_box_single_pdf() {
+    \cp -f pdf/todo/example.pdf.bak pdf/todo/example.pdf
+    bash src/predict.sh pdf
+    [ ! -f pdf/todo/example.pdf ] || exit 1
+    [ -f pdf/done/example.pdf ] || exit 1
+    rm -f pdf/done/example.pdf
+}
+
+black_box_single_pdf
+echo "All tests passed!"