Skip to content

Commit

Permalink
Fix issue with PDFs without extension
Browse files (browse the repository at this point in the history)
  • Loading branch information
gabriel-piles committed Oct 7, 2024
1 parent d4574a5 commit e6be908
Show file tree
Hide file tree
Showing 8 changed files with 35 additions and 40 deletions.
4 changes: 2 additions & 2 deletions docker-compose-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ services:

queue-processor-pdf-layout:
container_name: "queue-processor-pdf-layout"
entrypoint: [ "python", "-m", "src.QueueProcessor" ]
entrypoint: [ "python", "-m", "src.start_queue_processor" ]
init: true
restart: unless-stopped
build:
Expand All @@ -43,7 +43,7 @@ services:
worker-pdf-layout:
container_name: "worker-pdf-layout"
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
image: ghcr.io/huridocs/pdf-document-layout-analysis:0.0.11
image: ghcr.io/huridocs/pdf-document-layout-analysis:0.0.15
init: true
restart: unless-stopped
ports:
Expand Down
15 changes: 4 additions & 11 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
services:
api-pdf-layout:
container_name: "api-pdf-layout"
entrypoint: [ "gunicorn", "-w", "2", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5051", "--timeout", "300" ]
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5051", "--timeout", "300" ]
init: true
restart: unless-stopped
build:
Expand All @@ -20,7 +20,7 @@ services:

queue-processor-pdf-layout-gpu:
container_name: "queue-processor-pdf-layout-gpu"
entrypoint: [ "python", "-m", "src.QueueProcessor" ]
entrypoint: [ "python", "-m", "src.start_queue_processor" ]
init: true
restart: unless-stopped
build:
Expand All @@ -38,19 +38,12 @@ services:
- mongo-pdf-layout

worker-pdf-layout-gpu:
container_name: "worker-pdf-layout-gpu"
container_name: "worker-pdf-layout-no-gpu"
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
image: ghcr.io/huridocs/pdf-document-layout-analysis:0.0.11
image: ghcr.io/huridocs/pdf-document-layout-analysis:0.0.15
init: true
restart: unless-stopped
network_mode: host
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [ gpu ]
volumes:
- data:/app/xmls

Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
git+https://github.com/huridocs/pdf-document-layout-analysis@d6cbcc4891391fd9f2fc577c9cef6f9c8f7d9e6f
git+https://github.com/huridocs/queue-processor@26c9413ac4fd950ace4ee542d6734e6959e10ea4
git+https://github.com/huridocs/pdf-document-layout-analysis@7cde0c113a5a312decd8a95fd759b48399ef4fb5
git+https://github.com/huridocs/queue-processor@d30bf31e614694cf65f7117cd28cabf4afedfe55
graypy==2.1.0
PyYAML==6.0.1
pymongo==4.8.0
Expand Down
2 changes: 1 addition & 1 deletion src/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pathlib import Path
import graypy

QUEUES_NAMES = os.environ.get("QUEUES_NAMES", "segmentation")
QUEUES_NAMES = os.environ.get("QUEUES_NAMES", "segmentation development_segmentation")

SERVICE_HOST = os.environ.get("SERVICE_HOST", "http://127.0.0.1")
SERVICE_PORT = os.environ.get("SERVICE_PORT", "5051")
Expand Down
41 changes: 21 additions & 20 deletions src/delete_queues.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,34 @@
from redis import exceptions
from rsmq import RedisSMQ

from configuration import TASK_QUEUE_NAME, RESULTS_QUEUE_NAME
from configuration import QUEUES_NAMES

REDIS_HOST = "127.0.0.1"
REDIS_PORT = "6379"


def delete_queues():
try:
queue = RedisSMQ(
host=REDIS_HOST,
port=REDIS_PORT,
qname=TASK_QUEUE_NAME,
quiet=False,
)

queue.deleteQueue().exceptions(False).execute()
queue.createQueue().exceptions(False).execute()

queue = RedisSMQ(
host=REDIS_HOST,
port=REDIS_PORT,
qname=RESULTS_QUEUE_NAME,
quiet=False,
)

queue.deleteQueue().exceptions(False).execute()
queue.createQueue().exceptions(False).execute()
for queue_name in QUEUES_NAMES.split():
queue = RedisSMQ(
host=REDIS_HOST,
port=REDIS_PORT,
qname=queue_name + "_tasks",
quiet=False,
)

queue.deleteQueue().exceptions(False).execute()
queue.createQueue().exceptions(False).execute()

queue = RedisSMQ(
host=REDIS_HOST,
port=REDIS_PORT,
qname=queue_name + "_results",
quiet=False,
)

queue.deleteQueue().exceptions(False).execute()
queue.createQueue().exceptions(False).execute()

print("Queues properly deleted")

Expand Down
4 changes: 3 additions & 1 deletion src/extract_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@


def get_xml_name(task: Task) -> str:
    """Return the XML file name for *task*, formatted as ``<tenant>__<name>.xml``.

    The task's file name is lower-cased and every ``.pdf`` occurrence in it is
    replaced by ``.xml``. If the resulting name still does not end in ``.xml``
    (for example, a PDF that was uploaded without an extension), the ``.xml``
    suffix is appended so the returned name always carries it.

    Args:
        task: Object exposing ``tenant`` and ``params.filename``.

    Returns:
        The XML file name, always ending in ``.xml``.
    """
    xml_file_name = f"{task.tenant}__{task.params.filename.lower().replace('.pdf', '.xml')}"
    # An early return used to sit here, which made the extension fallback below
    # unreachable; the name must only be returned after the suffix is ensured.
    if not xml_file_name.endswith(".xml"):
        xml_file_name = f"{xml_file_name}.xml"
    return xml_file_name


def extract_segments(task: Task, xml_file_name: str = "") -> ExtractionData:
Expand Down
3 changes: 1 addition & 2 deletions src/get_xml.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import os
from os.path import join
from pathlib import Path

from configuration import DATA_PATH


def get_xml(xml_file_name: str) -> str:
xml_file_path = Path(join(DATA_PATH, xml_file_name))
xml_file_path = Path(DATA_PATH, xml_file_name)

with open(xml_file_path, mode="r") as file:
content = file.read()
Expand Down
2 changes: 1 addition & 1 deletion src/QueueProcessor.py → src/start_queue_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,5 @@ def process_task(task):
pass

queues_names = QUEUES_NAMES.split(" ")
queue_processor = QueueProcessor(REDIS_HOST, REDIS_PORT, queues_names, service_logger)
queue_processor = QueueProcessor(REDIS_HOST, REDIS_PORT, queues_names, service_logger, 7)
queue_processor.start(process)

0 comments on commit e6be908

Please sign in to comment.