-
Notifications
You must be signed in to change notification settings - Fork 1.5k
/
ocr_embedded_doc_images.py
173 lines (114 loc) · 7.22 KB
/
ocr_embedded_doc_images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
""" Example: ** Applying OCR to Images in LLMWare Library **
This example shows how to:
A. identify images in a library (post the initial parsing)
B. run an OCR against the images to derive the text from the image using the OCR
C. insert the text into the database library collection for subsequent retrieval.
Note: this example uses additional python dependencies:
-- pip3 install pytesseract
Note: this example uses an OCR engine, which is outside of the core llmware package. To install on Ubuntu:
-- sudo apt install tesseract-ocr
-- sudo apt install libtesseract-dev
[Other platforms:
-- Mac: brew install tesseract
-- Windows: GUI download installer - see UB-Mannheim @ www.github.com/UB-Mannheim/tesseract/wiki
Running this script will NOT make any changes to the original "image" block record in the text collection.
Rather than update/replace the existing record, the script will create a new supplemental entry for each 'image'.
Each new record will have the following attributes:
--"text" block with the text derived from the OCR, including the original source doc_ID, file source name and
page number for easy reference
--new block_ID starting at 100000 (safely out of the 'block namespace' of the original document, and easy to
identify as 'derived' text from an OCR, rather than an original part of the document
--text chunking applied to the OCR output, especially useful if it is a large image with a lot of text, e.g.,
a scanned page of a book - if the image contains a large text passage, it will be chunked and saved as
potentially several individual text blocks, 'chunked' according to the text chunk size parameter.
--a custom 'string' flag in special_field1 that indicates the text was created by an OCR, and includes a
reference to the original doc_ID and block_ID
--optional threshold for length of OCR to text to capture, e.g., if <10 characters captured, then may be
preferable to skip (higher probability that image is noisy)
"""
from llmware.library import Library
from llmware.configs import LLMWareConfig
from llmware.resources import CollectionRetrieval, CollectionWriter
from llmware.parsers import ImageParser
from importlib import util
if not util.find_spec("pytesseract"):
print("\nto run this example requires additional dependencies, including pytesseract - see comments above in "
"this script. to install pytesseract: pip3 install pytesseract.")
def ocr_images_in_library(library_name, add_new_text_block=False, chunk_size=400, min_chars=10):
lib = Library().load_library(library_name)
image_path = lib.image_path
# check here to see the images extracted from the original parsing
print("update: image source file path: ", image_path)
# query the collection DB by content_type == "image"
image_blocks = CollectionRetrieval(library_name).filter_by_key("content_type", "image")
doc_update_list = {}
new_text_created = 0
# iterate through the image blocks found
for i, block in enumerate(image_blocks):
# "external_files" points to the image name that will be found in the image_path above for the library
img_name = block["external_files"]
# each doc_ID is unique for the library collection
doc_id = block["doc_ID"]
# block_IDs are unique only for the document, and generally run in sequential ascending order
block_id = block["block_ID"]
# note: _id not used, but it is a good lookup key that can be easily inserted in special_field1 below
bid = block["_id"]
# preserve_spacing == True will keep \n \r \t and other white space
# preserve_spacing == False collapses the white space into a single space for 'more dense' text only
output = ImageParser(text_chunk_size=chunk_size).process_ocr(image_path,img_name,preserve_spacing=False)
print("update: ocr output: ", output)
# note: test before writing to the collection
if add_new_text_block:
for text_chunk in output:
if text_chunk.strip():
# optional to keep only more substantial chunks of text
if len(text_chunk) > min_chars:
# ad hoc tracker to keep incrementing the block_id for every new image in a particular doc
if doc_id in doc_update_list:
new_block_id = doc_update_list[doc_id]
doc_update_list.update({doc_id: new_block_id+1})
else:
new_block_id = 100000
doc_update_list.update({doc_id: new_block_id+1})
new_block = block
# feel free to adapt these attributes to fit for purpose
new_block.update({"block_ID": new_block_id})
new_block.update({"content_type": "text"})
new_block.update({"embedding_flags": {}})
new_block.update({"text_search": text_chunk})
new_block.update({"special_field1": f"OCR applied to image in document - "
f"{doc_id} - block - {block_id}"})
# new _id will be assigned by the database directly
if "_id" in new_block:
del new_block["_id"]
print("update: writing new text block - ", new_text_created, doc_id, block_id, text_chunk, new_block)
# creates the new record
CollectionWriter(ln).write_new_parsing_record(new_block)
new_text_created += 1
return new_text_created
# main execution script starts here
if __name__ == "__main__":
# select collection db - mongo, sqlite, or postgres
LLMWareConfig().set_active_db("postgres")
# create a library (source documents must have embedded images!)
ln = "my_library"
fp = "/path/to/pdf_or_office_files_with_images/"
lib = Library().create_new_library(ln)
# parse the documents
lib.add_files(fp, get_images=True,get_tables=True, chunk_size=400, max_chunk_size=600,
smart_chunking=1, verbose_level=2)
print("done parsing")
# runs ocr on the images in the newly-created library
# set add_new_text_block == True to add new rows in the database (otherwise, will just run the OCR in memory)
new_blocks = ocr_images_in_library(ln,add_new_text_block=False,chunk_size=400, min_chars=10)
print("done with ocr processing")
g = CollectionRetrieval(ln).get_whole_collection()
# may need to loop through the iterator if extremely large collection (otherwise, pull_all into memory OK)
blocks = g.pull_all()
ocr_count = 0
# look at the new text entries created and inserted into the collection
for i, b in enumerate(blocks):
if b["block_ID"] > 9999:
print("update: new ocr text entry: ", ocr_count, b["doc_ID"], b["block_ID"], b["content_type"],
b["special_field1"], b["text_search"])
ocr_count += 1