
Commit fc08689

Fix adi bugs
1 parent 424e090 commit fc08689

21 files changed: +164 / -106 lines changed

ai_search_with_adi/README.md

Lines changed: 5 additions & 1 deletion
@@ -38,10 +38,14 @@ The properties returned from the ADI Custom Skill are then used to perform the f

 ## Provided Notebooks \& Utilities

-- `./ai_search.py`, `./deployment.py` provide an easy Python based utility for deploying an index, indexer and corresponding skillset for AI Search.
+- `./ai_search.py`, `./deploy.py` provide an easy Python based utility for deploying an index, indexer and corresponding skillset for AI Search.
 - `./function_apps/indexer` provides a pre-built Python function app that communicates with Azure Document Intelligence, Azure OpenAI etc. to perform the Markdown conversion, extraction of figures, figure understanding and corresponding cleaning of Markdown.
 - `./rag_with_ai_search.ipynb` provides an example of how to utilise the AI Search plugin to query the index.

+## Deploying AI Search Setup
+
+To deploy the pre-built index and associated indexer / skillset setup, see the instructions in `./ai_search/README.md`.
+
 ## ADI Custom Skill

 Deploy the associated function app and required resources. You can then experiment with the custom skill by sending an HTTP request in the AI Search JSON format to the `/adi_2_ai_search` HTTP endpoint.
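
For reference, requests to a custom skill follow the Azure AI Search WebApiSkill contract: a JSON body with a `values` array, where each record carries a `recordId` and a `data` object. Below is a minimal sketch of exercising the endpoint; the function URL, the `code` key parameter and the `source` field inside `data` are assumptions for illustration, since this commit does not show the skill's input schema.

```python
import requests

# Hypothetical endpoint and key -- substitute the values from your function app deployment.
FUNCTION_URL = "https://<function-app>.azurewebsites.net/api/adi_2_ai_search"
FUNCTION_KEY = "<function-key>"

# Azure AI Search custom skills POST a "values" array; each record has a
# "recordId" and a "data" payload. The "source" field here is an assumed input.
payload = {
    "values": [
        {
            "recordId": "0",
            "data": {
                "source": "https://<storage-account>.blob.core.windows.net/documents/report.pdf"
            },
        }
    ]
}

response = requests.post(FUNCTION_URL, params={"code": FUNCTION_KEY}, json=payload, timeout=300)
response.raise_for_status()
print(response.json())  # Expected shape: {"values": [{"recordId": "0", "data": {...}}]}
```
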
Lines changed: 18 additions & 0 deletions

@@ -0,0 +1,18 @@
+# AI Search Indexing with Azure Document Intelligence - Pre-built Index Setup
+
+This portion of the repository contains pre-built scripts to deploy the skillset with Azure Document Intelligence.
+
+## Steps
+
+1. Update the `.env` file with the associated values. Not all values are required, depending on whether you are using System / User Assigned Identities or key based authentication.
+2. Adjust `rag_documents.py` with any changes to the index / indexer. The `get_skills()` method implements the skills pipeline. Make any adjustments to the skills needed to enrich the data source here.
+3. Run `deploy.py` with the following args:
+
+   - `indexer_type rag`. This selects the `rag_documents` subclass.
+   - `enable_page_chunking True`. This determines whether page-wise chunking is applied in ADI, or whether the inbuilt TextSplit skill is used. **Page-wise analysis in ADI is recommended to avoid splitting tables / figures across multiple chunks when the chunking is performed.**
+   - `rebuild`. Whether to delete and rebuild the index.
+   - `suffix`. Optional parameter that applies a suffix to the deployed index and indexer. This is useful if you want to deploy a test version before overwriting the main version.
+
+## ai_search.py & environment.py
+
+These provide a variety of helper functions and scripts to deploy the index setup. This is useful for CI/CD, avoiding the need to write JSON files manually or use the UI to deploy the pipeline.
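
The step list above implies a small CLI; `deploy.py` itself is not part of this diff, so the exact flag spellings are unknown. A minimal argparse sketch of how such an interface might be wired up, for orientation only:

```python
# Hypothetical CLI wiring mirroring the documented deploy.py arguments.
# The real deploy.py is not shown in this commit; all names below are illustrative.
import argparse


def str_to_bool(value: str) -> bool:
    """Parse 'True'/'False' style strings, since argparse's type=bool is a known pitfall."""
    return value.strip().lower() in ("true", "1", "yes")


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Deploy the AI Search index, indexer and skillset."
    )
    parser.add_argument("--indexer_type", default="rag",
                        help="Selects the indexer subclass, e.g. rag_documents.")
    parser.add_argument("--enable_page_chunking", type=str_to_bool, default=False,
                        help="Apply page-wise chunking in ADI instead of the inbuilt TextSplit skill.")
    parser.add_argument("--rebuild", action="store_true",
                        help="Delete and rebuild the index before deploying.")
    parser.add_argument("--suffix", default=None,
                        help="Optional suffix applied to the deployed index and indexer names.")
    args = parser.parse_args()

    # Example invocation (flag syntax assumed):
    #   python deploy.py --indexer_type rag --enable_page_chunking True --rebuild --suffix test
    print(args)


if __name__ == "__main__":
    main()
```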

ai_search_with_adi/function_apps/indexer/adi_2_ai_search.py renamed to ai_search_with_adi/function_app/adi_2_ai_search.py

Lines changed: 102 additions & 79 deletions
@@ -12,10 +12,11 @@
 from PIL import Image
 import io
 import logging
-from common.storage_account import StorageAccountHelper
+from storage_account import StorageAccountHelper
 import concurrent.futures
 import json
-from openai import AzureOpenAI
+from openai import AsyncAzureOpenAI
+import openai


 def crop_image_from_pdf_page(pdf_path, page_number, bounding_box):
@@ -42,7 +43,7 @@ def crop_image_from_pdf_page(pdf_path, page_number, bounding_box):


 def clean_adi_markdown(
-    markdown_text: str, page_no: int, remove_irrelevant_figures=False
+    markdown_text: str, page_no: int = None, remove_irrelevant_figures=False
 ):
     """Clean Markdown text extracted by the Azure Document Intelligence service.

@@ -56,21 +57,6 @@ def clean_adi_markdown(
         str: The cleaned Markdown text.
     """

-    # # Remove the page number comment
-    # page_number_pattern = r"<!-- PageNumber=\"\d+\" -->"
-    # cleaned_text = re.sub(page_number_pattern, "", markdown_text)
-
-    # # Replace the page header comment with its content
-    # page_header_pattern = r"<!-- PageHeader=\"(.*?)\" -->"
-    # cleaned_text = re.sub(
-    #     page_header_pattern, lambda match: match.group(1), cleaned_text
-    # )
-
-    # # Replace the page footer comment with its content
-    # page_footer_pattern = r"<!-- PageFooter=\"(.*?)\" -->"
-    # cleaned_text = re.sub(
-    #     page_footer_pattern, lambda match: match.group(1), cleaned_text
-    # )
     output_dict = {}
     comment_patterns = r"<!-- PageNumber=\"\d+\" -->|<!-- PageHeader=\".*?\" -->|<!-- PageFooter=\".*?\" -->"
     cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL)
@@ -94,7 +80,7 @@ def clean_adi_markdown(
     output_dict["sections"] = doc_metadata

     # add page number when chunk by page is enabled
-    if page_no > -1:
+    if page_no is not None:
         output_dict["page_number"] = page_no

     return output_dict
@@ -135,7 +121,7 @@ def update_figure_description(md_content, img_description, idx):
     return new_md_content


-async def understand_image_with_gptv(image_base64, caption):
+async def understand_image_with_gptv(image_base64, caption, tries_left=3):
    """
    Generates a description for an image using the GPT-4V model.

@@ -153,57 +139,81 @@ async def understand_image_with_gptv(image_base64, caption):
     deployment_name = os.environ["AzureAI__GPT4V_Deployment"]
     api_base = os.environ["AzureAI__GPT4V_APIbase"]

-    client = AzureOpenAI(
-        api_key=api_key,
-        api_version=api_version,
-        base_url=f"{api_base}/openai/deployments/{deployment_name}",
-    )
-
-    # We send both image caption and the image body to GPTv for better understanding
-    if caption != "":
-        response = client.chat.completions.create(
-            model=deployment_name,
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": f"Describe this image (note: it has image caption: {caption}):",
-                        },
-                        {
-                            "type": "image_base64",
-                            "image_base64": {"image": image_base64},
-                        },
-                    ],
-                },
-            ],
-            max_tokens=MAX_TOKENS,
-        )
-
-    else:
-        response = client.chat.completions.create(
-            model=deployment_name,
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": "Describe this image:"},
-                        {
-                            "type": "image_base64",
-                            "image_base64": {"image": image_base64},
-                        },
-                    ],
-                },
-            ],
-            max_tokens=MAX_TOKENS,
-        )
-
-    img_description = response.choices[0].message.content
-
-    return img_description
+    try:
+        async with AsyncAzureOpenAI(
+            api_key=api_key,
+            api_version=api_version,
+            base_url=f"{api_base}/openai/deployments/{deployment_name}",
+        ) as client:
+            # We send both image caption and the image body to GPTv for better understanding
+            if caption != "":
+                response = await client.chat.completions.create(
+                    model=deployment_name,
+                    messages=[
+                        {"role": "system", "content": "You are a helpful assistant."},
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": f"Describe this image with technical analysis. Provide a well-structured, description. IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'. (note: it has image caption: {caption}):",
+                                },
+                                {
+                                    "type": "image_base64",
+                                    "image_base64": {"image": image_base64},
+                                },
+                            ],
+                        },
+                    ],
+                    max_tokens=MAX_TOKENS,
+                )
+
+            else:
+                response = await client.chat.completions.create(
+                    model=deployment_name,
+                    messages=[
+                        {"role": "system", "content": "You are a helpful assistant."},
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": "Describe this image with technical analysis. Provide a well-structured, description. IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'.",
+                                },
+                                {
+                                    "type": "image_base64",
+                                    "image_base64": {"image": image_base64},
+                                },
+                            ],
+                        },
+                    ],
+                    max_tokens=MAX_TOKENS,
+                )
+
+            img_description = response.choices[0].message.content
+
+            logging.info(f"Image Description: {img_description}")
+
+            return img_description
+    except openai.RateLimitError as e:
+        logging.error("OpenAI Rate Limit Error: %s", e)
+
+        if tries_left > 0:
+            logging.info(
+                "Retrying understanding of image with %s tries left.", tries_left
+            )
+            remaining_tries = tries_left - 1
+            backoff = 20 ** (3 - remaining_tries)
+            await asyncio.sleep(backoff)
+            return await understand_image_with_gptv(
+                image_base64, caption, tries_left=remaining_tries
+            )
+        else:
+            raise Exception("OpenAI Rate Limit Error: No retries left.") from e
+    except (openai.OpenAIError, openai.APIConnectionError) as e:
+        logging.error("OpenAI Error: %s", e)
+
+        raise Exception("OpenAI Rate Limit Error: No retries left.") from e


 def pil_image_to_base64(image, image_format="JPEG"):
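
A side note on the retry branch in the hunk above: the backoff is `20 ** (3 - remaining_tries)` seconds, so with the default `tries_left=3` the sleeps grow to 20 s, 400 s and 8000 s before the final rate-limit error is raised. A quick way to reproduce the schedule:

```python
# Reproduces the backoff formula used in understand_image_with_gptv above.
for tries_left in (3, 2, 1):
    remaining_tries = tries_left - 1
    backoff = 20 ** (3 - remaining_tries)
    print(f"tries_left={tries_left} -> sleep {backoff} s")
# tries_left=3 -> sleep 20 s
# tries_left=2 -> sleep 400 s
# tries_left=1 -> sleep 8000 s
```
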
@@ -263,7 +273,9 @@ async def process_figures_from_extracted_content(

         image_base64 = pil_image_to_base64(cropped_image)

-        img_description += await understand_image_with_gptv(image_base64)
+        img_description += await understand_image_with_gptv(
+            image_base64, figure.caption.content
+        )
         logging.info(f"\tDescription of figure {idx}: {img_description}")

         markdown_content = update_figure_description(
@@ -287,13 +299,12 @@ def create_page_wise_content(result: AnalyzeResult) -> list:

     page_wise_content = []
     page_numbers = []
-    page_number = 0
-    for page in result.pages:
+
+    for page_number, page in enumerate(result.pages):
         page_content = result.content[
             page.spans[0]["offset"] : page.spans[0]["offset"] + page.spans[0]["length"]
         ]
         page_wise_content.append(page_content)
-        page_number += 1
         page_numbers.append(page_number)

     return page_wise_content, page_numbers
@@ -311,7 +322,6 @@ async def analyse_document(file_path: str) -> AnalyzeResult:
         AnalyzeResult: The result of the document analysis."""
     with open(file_path, "rb") as f:
         file_read = f.read()
-        # base64_encoded_file = base64.b64encode(file_read).decode("utf-8")

     async with DocumentIntelligenceClient(
         endpoint=os.environ["AIService__Services__Endpoint"],
@@ -335,6 +345,16 @@ async def analyse_document(file_path: str) -> AnalyzeResult:


 async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> dict:
+    """Process the extracted content from the Azure Document Intelligence service and prepare it for Azure Search.
+
+    Args:
+    -----
+        record (dict): The record containing the extracted content.
+        chunk_by_page (bool): Whether to chunk the content by page.
+
+    Returns:
+    --------
+        dict: The processed content ready for Azure Search."""
     logging.info("Python HTTP trigger function processed a request.")

     storage_account_helper = StorageAccountHelper()
@@ -431,20 +451,26 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
     try:
         if chunk_by_page:
             cleaned_result = []
-            markdown_content, page_no = create_page_wise_content(result)
-            tasks = [
+            markdown_content, page_numbers = create_page_wise_content(result)
+            content_with_figures_tasks = [
                 process_figures_from_extracted_content(
-                    temp_file_path, page_content, result.figures, page_number=idx
+                    temp_file_path,
+                    page_content,
+                    result.figures,
+                    page_number=page_number,
                 )
-                for idx, page_content in enumerate(markdown_content)
+                for page_content, page_number in zip(markdown_content, page_numbers)
             ]
-            content_with_figures = await asyncio.gather(*tasks)
+            content_with_figures = await asyncio.gather(*content_with_figures_tasks)
+
             with concurrent.futures.ProcessPoolExecutor() as executor:
                 futures = {
                     executor.submit(
-                        clean_adi_markdown, page_content, False
+                        clean_adi_markdown, page_content, page_number, False
                     ): page_content
-                    for page_content in content_with_figures
+                    for page_content, page_number in zip(
+                        content_with_figures, page_numbers
+                    )
                 }
                 for future in concurrent.futures.as_completed(futures):
                     cleaned_result.append(future.result())
@@ -455,7 +481,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->
                 temp_file_path, markdown_content, result.figures
             )
             cleaned_result = clean_adi_markdown(
-                content_with_figures, page_no=-1, remove_irrelevant_figures=False
+                content_with_figures, remove_irrelevant_figures=False
             )
     except Exception as e:
         logging.error(e)
@@ -483,7 +509,4 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) ->

     logging.info(f"final output: {json_str}")

-    return {
-        "recordId": record["recordId"],
-        "data": {"extracted_content": cleaned_result},
-    }
+    return src
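
For context, the removed `return` above built the per-record payload in the Azure AI Search custom skill (WebApiSkill) response contract; whether the new `return src` preserves that shape is not visible in this diff. A sketch of the per-record shape, using the key names from the removed lines:

```python
# Per-record response payload as built by the removed return statement.
# The HTTP handler is expected to wrap one of these per input record in a
# top-level {"values": [...]} response for AI Search.
def build_skill_record(record_id: str, extracted_content: dict) -> dict:
    return {
        "recordId": record_id,
        "data": {"extracted_content": extracted_content},
    }
```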

ai_search_with_adi/function_apps/common/payloads/header.py renamed to ai_search_with_adi/function_app/common/payloads/header.py

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ class TaskEnum(Enum):
     PENDING_INDEX_COMPLETION = "pending_index_completion"
     PENDING_INDEX_TRIGGER = "pending_index_trigger"

+
 class Header(BaseModel):
     """Header model"""

ai_search_with_adi/function_apps/common/payloads/pending_index_trigger.py renamed to ai_search_with_adi/function_app/common/payloads/pending_index_trigger.py

Lines changed: 3 additions & 1 deletion
@@ -20,7 +20,9 @@ class PendingIndexTriggerBody(BaseModel):
     id_name: Optional[str] = Field(
         None, description="The text name for the integer ID field"
     )
-    additional_field: Optional[str] = Field(None, description="Description of additional_field")
+    additional_field: Optional[str] = Field(
+        None, description="Description of additional_field"
+    )

     __config__ = ConfigDict(extra="ignore")
