Skip to content

Commit e6be908

Browse files
committed
Fix issue with PDFs without extension
1 parent d4574a5 commit e6be908

8 files changed

+35
-40
lines changed

docker-compose-test.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ services:
2323

2424
queue-processor-pdf-layout:
2525
container_name: "queue-processor-pdf-layout"
26-
entrypoint: [ "python", "-m", "src.QueueProcessor" ]
26+
entrypoint: [ "python", "-m", "src.start_queue_processor" ]
2727
init: true
2828
restart: unless-stopped
2929
build:
@@ -43,7 +43,7 @@ services:
4343
worker-pdf-layout:
4444
container_name: "worker-pdf-layout"
4545
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
46-
image: ghcr.io/huridocs/pdf-document-layout-analysis:0.0.11
46+
image: ghcr.io/huridocs/pdf-document-layout-analysis:0.0.15
4747
init: true
4848
restart: unless-stopped
4949
ports:

docker-compose.yml

Lines changed: 4 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
services:
22
api-pdf-layout:
33
container_name: "api-pdf-layout"
4-
entrypoint: [ "gunicorn", "-w", "2", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5051", "--timeout", "300" ]
4+
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5051", "--timeout", "300" ]
55
init: true
66
restart: unless-stopped
77
build:
@@ -20,7 +20,7 @@ services:
2020

2121
queue-processor-pdf-layout-gpu:
2222
container_name: "queue-processor-pdf-layout-gpu"
23-
entrypoint: [ "python", "-m", "src.QueueProcessor" ]
23+
entrypoint: [ "python", "-m", "src.start_queue_processor" ]
2424
init: true
2525
restart: unless-stopped
2626
build:
@@ -38,19 +38,12 @@ services:
3838
- mongo-pdf-layout
3939

4040
worker-pdf-layout-gpu:
41-
container_name: "worker-pdf-layout-gpu"
41+
container_name: "worker-pdf-layout-no-gpu"
4242
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5060", "--timeout", "10000"]
43-
image: ghcr.io/huridocs/pdf-document-layout-analysis:0.0.11
43+
image: ghcr.io/huridocs/pdf-document-layout-analysis:0.0.15
4444
init: true
4545
restart: unless-stopped
4646
network_mode: host
47-
deploy:
48-
resources:
49-
reservations:
50-
devices:
51-
- driver: nvidia
52-
count: 1
53-
capabilities: [ gpu ]
5447
volumes:
5548
- data:/app/xmls
5649

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
1-
git+https://github.com/huridocs/pdf-document-layout-analysis@d6cbcc4891391fd9f2fc577c9cef6f9c8f7d9e6f
2-
git+https://github.com/huridocs/queue-processor@26c9413ac4fd950ace4ee542d6734e6959e10ea4
1+
git+https://github.com/huridocs/pdf-document-layout-analysis@7cde0c113a5a312decd8a95fd759b48399ef4fb5
2+
git+https://github.com/huridocs/queue-processor@d30bf31e614694cf65f7117cd28cabf4afedfe55
33
graypy==2.1.0
44
PyYAML==6.0.1
55
pymongo==4.8.0

src/configuration.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
from pathlib import Path
55
import graypy
66

7-
QUEUES_NAMES = os.environ.get("QUEUES_NAMES", "segmentation")
7+
QUEUES_NAMES = os.environ.get("QUEUES_NAMES", "segmentation development_segmentation")
88

99
SERVICE_HOST = os.environ.get("SERVICE_HOST", "http://127.0.0.1")
1010
SERVICE_PORT = os.environ.get("SERVICE_PORT", "5051")

src/delete_queues.py

Lines changed: 21 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,33 +1,34 @@
11
from redis import exceptions
22
from rsmq import RedisSMQ
33

4-
from configuration import TASK_QUEUE_NAME, RESULTS_QUEUE_NAME
4+
from configuration import QUEUES_NAMES
55

66
REDIS_HOST = "127.0.0.1"
77
REDIS_PORT = "6379"
88

99

1010
def delete_queues():
1111
try:
12-
queue = RedisSMQ(
13-
host=REDIS_HOST,
14-
port=REDIS_PORT,
15-
qname=TASK_QUEUE_NAME,
16-
quiet=False,
17-
)
18-
19-
queue.deleteQueue().exceptions(False).execute()
20-
queue.createQueue().exceptions(False).execute()
21-
22-
queue = RedisSMQ(
23-
host=REDIS_HOST,
24-
port=REDIS_PORT,
25-
qname=RESULTS_QUEUE_NAME,
26-
quiet=False,
27-
)
28-
29-
queue.deleteQueue().exceptions(False).execute()
30-
queue.createQueue().exceptions(False).execute()
12+
for queue_name in QUEUES_NAMES.split():
13+
queue = RedisSMQ(
14+
host=REDIS_HOST,
15+
port=REDIS_PORT,
16+
qname=queue_name + "_tasks",
17+
quiet=False,
18+
)
19+
20+
queue.deleteQueue().exceptions(False).execute()
21+
queue.createQueue().exceptions(False).execute()
22+
23+
queue = RedisSMQ(
24+
host=REDIS_HOST,
25+
port=REDIS_PORT,
26+
qname=queue_name + "_results",
27+
quiet=False,
28+
)
29+
30+
queue.deleteQueue().exceptions(False).execute()
31+
queue.createQueue().exceptions(False).execute()
3132

3233
print("Queues properly deleted")
3334

src/extract_segments.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,9 @@
99

1010

1111
def get_xml_name(task: Task) -> str:
12-
return f"{task.tenant}__{task.params.filename.lower().replace('.pdf', '.xml')}"
12+
xml_file_name = f"{task.tenant}__{task.params.filename.lower().replace('.pdf', '.xml')}"
13+
xml_file_name = xml_file_name if xml_file_name.endswith(".xml") else f"{xml_file_name}.xml"
14+
return xml_file_name
1315

1416

1517
def extract_segments(task: Task, xml_file_name: str = "") -> ExtractionData:

src/get_xml.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,11 @@
11
import os
2-
from os.path import join
32
from pathlib import Path
43

54
from configuration import DATA_PATH
65

76

87
def get_xml(xml_file_name: str) -> str:
9-
xml_file_path = Path(join(DATA_PATH, xml_file_name))
8+
xml_file_path = Path(DATA_PATH, xml_file_name)
109

1110
with open(xml_file_path, mode="r") as file:
1211
content = file.read()

src/QueueProcessor.py renamed to src/start_queue_processor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -88,5 +88,5 @@ def process_task(task):
8888
pass
8989

9090
queues_names = QUEUES_NAMES.split(" ")
91-
queue_processor = QueueProcessor(REDIS_HOST, REDIS_PORT, queues_names, service_logger)
91+
queue_processor = QueueProcessor(REDIS_HOST, REDIS_PORT, queues_names, service_logger, 7)
9292
queue_processor.start(process)

0 commit comments

Comments
 (0)