diff --git a/CHANGELOG.md b/CHANGELOG.md
index 00ba7635d9..49e3878a0e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.15.12-dev1
+## 0.15.12-dev2
### Enhancements
+* **Improve `pdfminer` element processing** Split `pdfminer` elements (groups of text chunks) into smaller bounding boxes (text lines). This prevents loss of information from the object detection model and makes removal of duplicated `pdfminer` text more effective.
+
### Features
### Fixes
@@ -22,7 +24,6 @@
* **Enhance `pdfminer` element cleanup** Expand removal of `pdfminer` elements to include those inside all `non-pdfminer` elements, not just `tables`.
* **Modified analysis drawing tools to dump to files and draw from dumps** If the parameter `analysis` of the `partition_pdf` function is set to `True`, the layout for Object Detection, Pdfminer Extraction, OCR and final layouts will be dumped as json files. The drawers now accept dict (dump) objects instead of internal classes instances.
* **Vectorize pdfminer elements deduplication computation**. Use `numpy` operations to compute IOU and sub-region membership instead of using simply loop. This improves the speed of deduplicating elements for pages with a lot of elements.
-* **Add deprecation warning to embed code**
### Features
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 3d41d6e304..4276b3f8d0 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -159,7 +159,7 @@ def test_partition_pdf_local_raises_with_no_filename():
[
(PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
(PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
- (PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer"}),
+ (PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer", "ocr_tesseract"}),
(PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
],
)
@@ -552,7 +552,7 @@ def test_partition_pdf_with_copy_protection():
filename = example_doc_path("pdf/copy-protected.pdf")
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
- idx = 2
+ idx = 22
assert elements[idx].text == title
assert {element.metadata.page_number for element in elements} == {1, 2}
assert elements[idx].metadata.detection_class_prob is not None
diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
index 5dcc804cf7..e01587516e 100644
--- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
+++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@@ -2,7 +2,7 @@
import pytest
from PIL import Image
from unstructured_inference.constants import Source as InferenceSource
-from unstructured_inference.inference.elements import Rectangle, TextRegion
+from unstructured_inference.inference.elements import EmbeddedTextRegion, Rectangle, TextRegion
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
from unstructured.partition.pdf_image.pdfminer_processing import (
@@ -11,6 +11,7 @@
boxes_self_iou,
clean_pdfminer_duplicate_image_elements,
clean_pdfminer_inner_elements,
+ remove_duplicate_embedded_text,
)
from unstructured.partition.utils.constants import Source
@@ -209,3 +210,22 @@ def test_bboxes1_is_almost_subregion_of_bboxes2(coords1, coords2, expected):
def test_boxes_self_iou(coords, threshold, expected):
bboxes = [Rectangle(*row) for row in coords]
np.testing.assert_array_equal(boxes_self_iou(bboxes, threshold), expected)
+
+
+def test_remove_duplicate_embedded_text():
+    sample_elements = [
+        EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 1"),
+        EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 2"),
+        EmbeddedTextRegion(bbox=Rectangle(20, 20, 30, 30), text="Text 3"),
+    ]
+
+    result = remove_duplicate_embedded_text(sample_elements)
+
+    # Regions 1 and 2 share a bbox: expect 2 survivors, with "Text 2" kept over "Text 1"
+    assert len(result) == 2
+    assert result[0].text == "Text 2"
+    assert result[1].text == "Text 3"
+
+    # The surviving bboxes are the two distinct rectangles, in their original order
+    assert result[0].bbox == Rectangle(0, 0, 10, 10)
+    assert result[1].bbox == Rectangle(20, 20, 30, 30)
diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py
new file mode 100644
index 0000000000..075a4e151e
--- /dev/null
+++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py
@@ -0,0 +1,28 @@
+from unittest.mock import MagicMock
+
+from pdfminer.layout import LTContainer, LTTextLine
+
+from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects
+
+
+def test_extract_text_objects_nested_containers():
+    """extract_text_objects returns LTTextLines from an outer container and one nested in it."""
+    # Leaf text lines; spec= makes each mock pass isinstance() checks against LTTextLine
+    mock_text_line1 = MagicMock(spec=LTTextLine)
+    mock_text_line2 = MagicMock(spec=LTTextLine)
+
+    # Inner container: iterating it yields only the second text line
+    mock_inner_container = MagicMock(spec=LTContainer)
+    mock_inner_container.__iter__.return_value = [mock_text_line2]
+
+    # Outer container: iterating it yields the first text line, then the inner container
+    mock_outer_container = MagicMock(spec=LTContainer)
+    mock_outer_container.__iter__.return_value = [mock_text_line1, mock_inner_container]
+
+    # Only the outer container is passed in; the nested one is reachable solely through it
+    result = extract_text_objects(mock_outer_container)
+
+    # Exactly the two text lines come back, so the containers themselves are not in the result
+    assert len(result) == 2
+    assert mock_text_line1 in result
+    assert mock_text_line2 in result
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index 0460016bab..54fdfe8bed 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -6,7 +6,6 @@
import json
import os
import pathlib
-import sys
import tempfile
import warnings
from importlib import import_module
@@ -505,7 +504,7 @@ def test_auto_partition_org_from_file():
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
- file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
+ file_path = example_doc_path("pdf/chevron-page.pdf")
metadata_filename = file_path if pass_metadata_filename else None
elements = partition(
@@ -515,20 +514,15 @@ def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_
strategy=PartitionStrategy.HI_RES,
)
- # NOTE(scanny): gave up trying to figure out why, but this file partitions differently locally
- # (on Mac) than it does in CI. Basically the first element when partitioning locally is split
- # in two when partitioning on CI. Other than that split the text is exactly the same.
- idx = 2 if sys.platform == "darwin" else 3
-
- e = elements[idx]
+ e = elements[0]
assert isinstance(e, Title)
- assert e.text.startswith("LayoutParser")
+ assert e.text.startswith("eastern mediterranean")
assert e.metadata.filename == os.path.basename(file_path)
assert e.metadata.file_directory == os.path.split(file_path)[0]
- e = elements[idx + 1]
+ e = elements[1]
assert isinstance(e, NarrativeText)
- assert e.text.startswith("Zejiang Shen")
+ assert e.text.startswith("We’re investing")
@pytest.mark.parametrize(
@@ -536,7 +530,7 @@ def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
- file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
+ file_path = example_doc_path("pdf/chevron-page.pdf")
metadata_filename = file_path if pass_metadata_filename else None
with open(file_path, "rb") as f:
@@ -547,16 +541,13 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type
strategy=PartitionStrategy.HI_RES,
)
- # NOTE(scanny): see "from_filename" version of this test above for more on this oddness
- idx = 2 if sys.platform == "darwin" else 3
-
- e = elements[idx]
+ e = elements[0]
assert isinstance(e, Title)
- assert e.text.startswith("LayoutParser")
+ assert e.text.startswith("eastern mediterranean")
- e = elements[idx + 1]
+ e = elements[1]
assert isinstance(e, NarrativeText)
- assert e.text.startswith("Zejiang Shen")
+ assert e.text.startswith("We’re investing")
def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
diff --git a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
index 18bae2ec35..92d6daaa1c 100644
--- a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
@@ -307,10 +307,32 @@
}
}
},
+ {
+ "type": "Header",
+ "element_id": "9aa82368657b60536f152fd413aec316",
+ "text": "Core Skills for Biomedical Data Scientists",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 2,
+ "data_source": {
+ "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
+ "version": "237960874052008560436652606947751982249",
+ "record_locator": {
+ "protocol": "abfs",
+ "remote_file_path": "abfs://container1/"
+ },
+ "date_created": "1678440764.0",
+ "date_modified": "1678440764.0"
+ }
+ }
+ },
{
"type": "UncategorizedText",
- "element_id": "b810a8721369c3551c942aab9011b7d1",
- "text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________",
+ "element_id": "4f2dbe3656a9ebc60c7e3426ad3cb3e3",
+ "text": "_____________________________________________________________________________________________",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -331,7 +353,7 @@
},
{
"type": "NarrativeText",
- "element_id": "c8fdefac1ae82fa42caeceff04853415",
+ "element_id": "cd359ae8c49885ead47318021438eead",
"text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce.",
"metadata": {
"filetype": "application/pdf",
@@ -353,7 +375,7 @@
},
{
"type": "Title",
- "element_id": "b5b7392d0a946f5016bfa8ad0c248a9b",
+ "element_id": "bf8321a34edb7103ec4209f3e4a8a8da",
"text": "Methodology",
"metadata": {
"filetype": "application/pdf",
@@ -375,7 +397,7 @@
},
{
"type": "NarrativeText",
- "element_id": "d9d8e38d221ae621c0ddbcabaa4a28b4",
+ "element_id": "1e1d3d1a5c1397fc588393568d829bc8",
"text": "The Workforce Excellence team took a three-pronged approach to identifying core skills required of a biomedical data scientist (BDS), drawing from:",
"metadata": {
"filetype": "application/pdf",
@@ -397,7 +419,7 @@
},
{
"type": "ListItem",
- "element_id": "ba70aa3bc3ad0dec6a62939c94c5a20c",
+ "element_id": "45d7ff56632d66a2ab2d4dd2716d4d2e",
"text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use.",
"metadata": {
"filetype": "application/pdf",
@@ -419,7 +441,7 @@
},
{
"type": "ListItem",
- "element_id": "24724b1f0d20a6575f2782fd525c562f",
+ "element_id": "bf452aac5123fcedda30dd6ed179f41c",
"text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A.",
"metadata": {
"filetype": "application/pdf",
@@ -441,7 +463,7 @@
},
{
"type": "ListItem",
- "element_id": "5e6c73154a1e5f74780c69afbc9bc084",
+ "element_id": "ca176cbef532792b1f11830ff7520587",
"text": "c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad.",
"metadata": {
"filetype": "application/pdf",
@@ -463,7 +485,7 @@
},
{
"type": "NarrativeText",
- "element_id": "249f6c76b2c99dadbefb8b8811b0d4cd",
+ "element_id": "11b170fedd889c3b895bbd28acd811ca",
"text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations for core skills necessary for a competitive biomedical data scientist.",
"metadata": {
"filetype": "application/pdf",
@@ -485,8 +507,30 @@
},
{
"type": "NarrativeText",
- "element_id": "6543ce4e447de8fb3db98ceb06a50c28",
- "text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017",
+ "element_id": "2665aadf75bca259f1f5b4c91a53a301",
+ "text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 2,
+ "data_source": {
+ "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
+ "version": "237960874052008560436652606947751982249",
+ "record_locator": {
+ "protocol": "abfs",
+ "remote_file_path": "abfs://container1/"
+ },
+ "date_created": "1678440764.0",
+ "date_modified": "1678440764.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "8bbfe1c3e6bca9a33226d20d69b2297a",
+ "text": "2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -507,7 +551,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "1a6ff96d028f18331a9d9c9748b49321",
+ "element_id": "dd4a661e1a3c898a5cf6328ba56b924d",
"text": "2",
"metadata": {
"filetype": "application/pdf",
diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
index bfd924c00f..164e9cfa2f 100644
--- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
@@ -221,7 +221,7 @@
"type": "NarrativeText"
},
{
- "element_id": "f08a5de1783556706ed5bb30cd0a20b7",
+ "element_id": "c53ddaba48097526e6995e19ceeb99d2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -230,11 +230,11 @@
],
"page_number": 1
},
- "text": "This data article contains data related to the research article entitled “enhanced corrosion resistance of stainless steel Type 316 in sulphuric acid solution using eco-friendly waste product” (Sanni et al., 2018). In this data article, a comprehensive effect of waste product and optimized process parameter of the inhibitor in 0.5 M H2SO4 solution was presented using weight loss and potentiody- the inhibitor namic polarization techniques. The presence of (egg shell powder) influenced corrosion resistance of stainless steel. Inhibition efficiency value of 94.74% was recorded as a result of inhibition of the steel by the ionized molecules of the inhibiting compound of the egg shell powder influencing the redox mechan- ism reactions responsible for corrosion and surface deterioration.",
+ "text": "This data article contains data related to the research article entitled “enhanced corrosion resistance of stainless steel Type 316 in sulphuric acid solution using eco-friendly waste product” (Sanni et al., 2018). In this data article, a comprehensive effect of waste product and optimized process parameter of the inhibitor in 0.5 M H2SO4 solution was presented using weight loss and potentiody- namic polarization techniques. The presence of the inhibitor (egg shell powder) influenced corrosion resistance of stainless steel. Inhibition efficiency value of 94.74% was recorded as a result of inhibition of the steel by the ionized molecules of the inhibiting compound of the egg shell powder influencing the redox mechan-",
"type": "NarrativeText"
},
{
- "element_id": "7cb94187e8811c4a0352d86f3f228a79",
+ "element_id": "9eb17e280cf65e8e0fee692c37a95588",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -243,11 +243,11 @@
],
"page_number": 1
},
- "text": "reactions responsible for corrosion and surface deterioration. © 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license",
+ "text": "ism reactions responsible for corrosion and surface deterioration. & 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license",
"type": "NarrativeText"
},
{
- "element_id": "436c3af036856e288b37851de57c79ee",
+ "element_id": "e4929aeb09c891d47401db4b7944f6b4",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -256,7 +256,7 @@
],
"page_number": 1
},
- "text": "& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).",
+ "text": "(http://creativecommons.org/licenses/by-nc-nd/4.0/).",
"type": "NarrativeText"
},
{
@@ -273,7 +273,7 @@
"type": "Title"
},
{
- "element_id": "aeff16fd11e1af6e1cca85bc484677f5",
+ "element_id": "7b3368f23f1fcba32888fff9774a7bdd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -282,7 +282,7 @@
],
"page_number": 1
},
- "text": "Subject area More specific subject area Surface science and engineering Type of data",
+ "text": "Subject area Materials engineering More specific subject area Surface science and engineering Type of data Table and figure",
"type": "NarrativeText"
},
{
@@ -299,7 +299,7 @@
"type": "ListItem"
},
{
- "element_id": "e78977d490a7dba2a9358e93821a885f",
+ "element_id": "4f3da5135ad2a99c9a2fc0e3372d1b26",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -308,7 +308,20 @@
],
"page_number": 1
},
- "text": "https://doi.org/10.1016/j.dib.2018.11.134 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).",
+ "text": "https://doi.org/10.1016/j.dib.2018.11.134",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "153f469e204d0eee0f5d96da52039b9a",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).",
"type": "NarrativeText"
},
{
@@ -325,7 +338,7 @@
"type": "ListItem"
},
{
- "element_id": "774ec56cd1d6c2c1a37b3fa09771fd20",
+ "element_id": "6277cd91869e10d6256f362b08d3e789",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -334,7 +347,7 @@
],
"page_number": 2
},
- "text": "How data were acquired Data format Experimental factors Experimental features Data source location Accessibility Related research article The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225–230.",
+ "text": "How data were acquired The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Data format Raw, analyzed Experimental factors The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Experimental features Inhibitor concentration, exposure time Data source location Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Accessibility Data are available within this article Related research article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225–230.",
"type": "Table"
},
{
@@ -351,7 +364,7 @@
"type": "Title"
},
{
- "element_id": "5738486524b113c530187175089c2ada",
+ "element_id": "f2b57562924402b85f6eb07925ea1654",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -360,11 +373,24 @@
],
"page_number": 2
},
- "text": "© Data presented here provide optimum conditions of waste material as inhibitor for stainless steel Type 316 in 0.5M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment.",
+ "text": "© Data presented here provide optimum conditions of waste material as inhibitor for stainless steel",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "d9f6efffd49ef59e671206bfb5f094de",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 2
+ },
+ "text": "Type 316 in 0.5 M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment.",
"type": "ListItem"
},
{
- "element_id": "01a22eb39213ca23143f65379ffc44b3",
+ "element_id": "2a1e46bc589c5eca777b657e141e824b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -373,11 +399,24 @@
],
"page_number": 2
},
- "text": "© The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316 can be used as basis in determining the inhibitive performance of the same inhibitor in other environments.",
+ "text": "© The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "2c42182c07ecdb96362b534a8fad4d59",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 2
+ },
+ "text": "can be used as basis in determining the inhibitive performance of the same inhibitor in other environments.",
"type": "ListItem"
},
{
- "element_id": "bd0f9e47bdef04cdb80722f75151f134",
+ "element_id": "c6fd85f9219a2c75bb1f8c1889bb2b5f",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -390,7 +429,7 @@
"type": "NarrativeText"
},
{
- "element_id": "38ada84c0543e3c8017d09cc219839c3",
+ "element_id": "07cdb1623f501ea23a343039300178cc",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -403,7 +442,7 @@
"type": "ListItem"
},
{
- "element_id": "ad92455add13d8b0c3d49eefb5221be6",
+ "element_id": "4bf8165bcb21c5296b741ba0f9e38f93",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -416,7 +455,7 @@
"type": "Title"
},
{
- "element_id": "689fdb087caca4bb2dcc846d7613fff2",
+ "element_id": "85918ce2a03e9f236137a0fe72985af0",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -429,7 +468,7 @@
"type": "NarrativeText"
},
{
- "element_id": "eacb5d7bd9e8dd184f821f33ec68bac0",
+ "element_id": "93537983496efa695cfc65ad895d9412",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -438,11 +477,11 @@
],
"page_number": 2
},
- "text": "30 ) g m ( s s o l t h g e W i 20 10g 8g 6g 4g 2g Control 10 48 96 144 192 Exposure Time (Hours) ",
+ "text": "30 10g ) g m ( 8g 6g s s o l t h 20 4g 2g Control g i e W 10 48 96 144 192 Exposure Time (Hours) ",
"type": "Image"
},
{
- "element_id": "0fbbd848dbf4018984cb712fea9e927b",
+ "element_id": "76b94e78b638b79374e266284c1a0d83",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -451,7 +490,7 @@
],
"page_number": 2
},
- "text": "Fig. 1. Weight loss versus exposure time for stainless steel presence of ES. immersed in 0.5 M H2SO4 solution in the absence and",
+ "text": "Fig. 1. Weight loss versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the absence and presence of ES.",
"type": "FigureCaption"
},
{
@@ -468,7 +507,7 @@
"type": "NarrativeText"
},
{
- "element_id": "ac5ec43db85d40effe0a4c2e82f9db5d",
+ "element_id": "1080514a81075260aff8e2d2daadc08f",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -477,7 +516,7 @@
],
"page_number": 3
},
- "text": "2.7 ) r a e y / m m ( e t a r n o s o r r o C i 1.8 0.9 10g 8g 6g 4g 2g Control 24 48 72 96 120 144 168 192 Exposure time ",
+ "text": "2.7 ) r a e y / m m ( e 1.8 t a r n o 10g 8g i s o r r o C 0.9 6g 4g 2g Control 24 48 72 96 120 144 168 192 Exposure time ",
"type": "Image"
},
{
@@ -494,7 +533,7 @@
"type": "FigureCaption"
},
{
- "element_id": "04a954754e3a2a49f6b608bb384f3eae",
+ "element_id": "86202dccdd86c3f690e636d77f97a80d",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -503,7 +542,7 @@
],
"page_number": 3
},
- "text": "100 90 ) % ( y c n e c i f f i E n o i t i b h n I i 80 70 60 50 40 30 2g 4g 6g 8g 10g 20 10 0 20 40 60 80 100 120 140 160 180 Exposure Time (Hours) ",
+ "text": "100 90 80 2g 4g ) % ( y c n e 70 60 6g 8g 10g i c i f f E 50 n o i t i 40 b i h n I 30 20 10 0 20 40 60 80 100 120 140 160 180 Exposure Time (Hours) ",
"type": "Image"
},
{
@@ -598,7 +637,7 @@
"type": "FigureCaption"
},
{
- "element_id": "81bc21d3d3f853a17bb49bed4620be2f",
+ "element_id": "0a05f8568758bcff4e2912e0fd11eb02",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -607,11 +646,24 @@
],
"page_number": 4
},
- "text": "Table 1 Potentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO4 solution.",
- "type": "UncategorizedText"
+ "text": "Table 1",
+ "type": "Title"
+ },
+ {
+ "element_id": "8cf43a41b44a990a754bf2f4d68a148b",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 4
+ },
+ "text": "Potentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO4 solution.",
+ "type": "FigureCaption"
},
{
- "element_id": "b6e672fb23fd678860f5a255190cf5d7",
+ "element_id": "7e0388ec6fd4ec451d96232e30d41e7c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -624,7 +676,7 @@
"type": "Table"
},
{
- "element_id": "68599427d5b8eadfb8d7f962655f0597",
+ "element_id": "d61e56d1a4c761ad3c69f4b970ba4f3c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -633,11 +685,11 @@
],
"page_number": 4
},
- "text": "Corrosion rate (mm/year)",
+ "text": "rate (mm/year)",
"type": "Title"
},
{
- "element_id": "f55e192ad2e6bca248a2dae6c281cc69",
+ "element_id": "3a5534c2aafc2d8a4c0b65d530d00ab3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -650,7 +702,7 @@
"type": "NarrativeText"
},
{
- "element_id": "88a4bcf370d7f204b6c93e742a872521",
+ "element_id": "3aca0cc2ad685c5ca25646c82eeb73a8",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -659,11 +711,11 @@
],
"page_number": 4
},
- "text": "12 10 8 0 / C 6 4 2 2 4 6 8 10 Concentration (g) ",
+ "text": "12 10 8 0 / C 6 4 2 2 4 6 8 10 Concentration (g) ",
"type": "Image"
},
{
- "element_id": "c835ef0f50c0cb95aeeeb4be8e5eed0e",
+ "element_id": "5746a0363a91513e4bed4e740cba2fa5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -676,7 +728,7 @@
"type": "UncategorizedText"
},
{
- "element_id": "5e8e16200ca068ad87eeadee54bb3e4f",
+ "element_id": "a98eb6a3340c566043ba7e8c0bd5208e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -923,20 +975,7 @@
"type": "NarrativeText"
},
{
- "element_id": "02e068fc4978fc17556d1708744d48c9",
- "metadata": {
- "data_source": {},
- "filetype": "application/pdf",
- "languages": [
- "eng"
- ],
- "page_number": 6
- },
- "text": "87.6W = (ar",
- "type": "UncategorizedText"
- },
- {
- "element_id": "3ea6811be10934fd3c8d884e9174047b",
+ "element_id": "a109843109833c3c89155a3df5baf295",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -945,11 +984,11 @@
],
"page_number": 6
},
- "text": "Corrosion rate CRð ð1Þ",
+ "text": "Corrosion rate CRð Þ ¼ 87:6W DAT ð1Þ",
"type": "Formula"
},
{
- "element_id": "7459b20ea68d65b7a967500f22223507",
+ "element_id": "ab7d652abb3f20bfce1acd30c245f8cd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -962,7 +1001,7 @@
"type": "NarrativeText"
},
{
- "element_id": "afe4e57ab4c1f95ae48924fc26f9f598",
+ "element_id": "20819cbe14c5bcd5f247ff810061b23c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -975,7 +1014,7 @@
"type": "Formula"
},
{
- "element_id": "8927c6b0343d5688f2b8ef20da2067a4",
+ "element_id": "d0f8a91f05a9127a338e65478113588d",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -988,7 +1027,7 @@
"type": "Formula"
},
{
- "element_id": "6b494432a9388fc706ffcd80a99288e0",
+ "element_id": "8753502ab1103730de33fe05c1bed34b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -1001,7 +1040,7 @@
"type": "NarrativeText"
},
{
- "element_id": "066b524dc82e4d6f6f8564457b56b88a",
+ "element_id": "7d233f7c1d7b52dc704cb3c68588dad1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -1014,7 +1053,7 @@
"type": "NarrativeText"
},
{
- "element_id": "171ed5a4187c43f2ce865f372ffc80e0",
+ "element_id": "25833fe4955e01b455cf77d0cfd7d71f",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -1027,7 +1066,7 @@
"type": "NarrativeText"
},
{
- "element_id": "e830aa2e186bf2d03f4bccc04a39b546",
+ "element_id": "57906367eca399b52f7eecbf78345bf4",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -1040,7 +1079,7 @@
"type": "Title"
},
{
- "element_id": "fd75f111787e450e3f278b57ad154531",
+ "element_id": "cff55ae1916232dbda5239f59c897cb9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
index 32dc77c7c3..64c57d6dfc 100644
--- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
@@ -143,7 +143,7 @@
"type": "NarrativeText"
},
{
- "element_id": "016c6f30b572558eeb8b71edc23834ac",
+ "element_id": "1cd38ce7fe0ffbaf86c4dc77b2de9fb3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -152,11 +152,102 @@
],
"page_number": 1
},
- "text": "a IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India b SJM School of Management, IIT Bombay, Powai, Mumbai 400076, India c School of Mathematical Sciences, Monash University, Clayton, VIC 3800, Australia d Department of Mechanical and Aerospace Engineering, Monash University, Clayton, VIC 3800, Australia e School of Information Technology and Electrical Engineering, The University of Queensland, QLD 4072, Australia f Department of Computer Science and Engineering, IIT Bombay, Powai, Mumbai 400076, India",
+ "text": "a IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "53da3cdbc0b8bf4d18be34d28ff5b23e",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "b SJM School of Management, IIT Bombay, Powai, Mumbai 400076, India",
+ "type": "ListItem"
+ },
+ {
+ "element_id": "57631ad0ec5b8f3569ccf4e7e500a0ed",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "c School of Mathematical Sciences, Monash University, Clayton, VIC 3800, Australia",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "550fdd63139a3a90a35bc871a4a54546",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "d Department of Mechanical and Aerospace Engineering, Monash University, Clayton, VIC 3800, Australia",
+ "type": "ListItem"
+ },
+ {
+ "element_id": "8c49eb856f0d4a8e36e4a83c02b018bd",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "©",
"type": "UncategorizedText"
},
{
- "element_id": "b211f93156fafa7348f2be1e4f569c21",
+ "element_id": "03b4116b32ee9de3beea142b52694b19",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "e School of Information Technology and Electrical Engineering, The University of Queensland, QLD 4072,",
+ "type": "UncategorizedText"
+ },
+ {
+ "element_id": "bfcbabb9ed9169f6a4be19576064f702",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "Australia",
+ "type": "Title"
+ },
+ {
+ "element_id": "85875ebbc1de554e92edc54674add1d5",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "f Department of Computer Science and Engineering, IIT Bombay, Powai, Mumbai 400076, India",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "f9f33fff8fbb981301df3055b60e12c7",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -169,7 +260,7 @@
"type": "NarrativeText"
},
{
- "element_id": "642e946699f33e26140357538d0dcc72",
+ "element_id": "4f3f69dd17ddae776c656ec73d9837ae",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -182,7 +273,7 @@
"type": "NarrativeText"
},
{
- "element_id": "2354c20c25e668c32da0f18eea88758e",
+ "element_id": "34522460857b10c63d8c2c8d2fbb3087",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -195,7 +286,7 @@
"type": "NarrativeText"
},
{
- "element_id": "ff825f9f1f67d0a09964b630e6e8b20c",
+ "element_id": "a807aca2a8ed5b05247afce4462a5265",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -208,7 +299,7 @@
"type": "NarrativeText"
},
{
- "element_id": "135f9d4c925434bc91d93fd9f977cf24",
+ "element_id": "7961184fa99a7e10d50f37a0b56f8fb6",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -217,11 +308,11 @@
],
"page_number": 1
},
- "text": "& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).",
+ "text": "& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license",
"type": "NarrativeText"
},
{
- "element_id": "88dbc4847fce9fb9cd4c1bd72209f5e2",
+ "element_id": "f3f737f73d82d84c0a644d2539c719f9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -230,11 +321,24 @@
],
"page_number": 1
},
- "text": "DOI of original article: https://doi.org/10.1016/j.trb.2018.11.007 n Corresponding author at: IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India.",
- "type": "UncategorizedText"
+ "text": "(http://creativecommons.org/licenses/by-nc-nd/4.0/).",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "26df7873600c047edc18648775de77ae",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "DOI of original article: https://doi.org/10.1016/j.trb.2018.11.007",
+ "type": "NarrativeText"
},
{
- "element_id": "d015eb225a0c464c1281688fd4a873f2",
+ "element_id": "7908ca49d8b815456f5785d535b93235",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -243,11 +347,24 @@
],
"page_number": 1
},
- "text": "E-mail address: sarangkulkarni@iitb.ac.in (S. Kulkarni).",
+ "text": "n Corresponding author at: IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India. E-mail address: sarangkulkarni@iitb.ac.in (S. Kulkarni).",
"type": "ListItem"
},
{
- "element_id": "ca5a07d65b88b23bf94f92d1392a1f9b",
+ "element_id": "1d4ac78953564742e250d50dc9207822",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "https://doi.org/10.1016/j.dib.2018.12.055",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "5e4b10d5b4882e8af9496518545c64f8",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -256,7 +373,7 @@
],
"page_number": 1
},
- "text": "https://doi.org/10.1016/j.dib.2018.12.055 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).",
+ "text": "2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).",
"type": "NarrativeText"
},
{
@@ -286,7 +403,7 @@
"type": "Title"
},
{
- "element_id": "1d5dccc54143c991c5dac7f3eaf028e8",
+ "element_id": "0cc9334df550d1730f2d468941a38225",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -295,7 +412,7 @@
],
"page_number": 2
},
- "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired Tables, text files Artificially generated by a C þ þ program on Intels Xeons CPU E5– 2670 v2 with Linux operating system. Raw Sixty randomly generated instances of the MDVSP with the number of depots in (8, 12, 16) and the number of trips in (1500, 2000, 2500, 3000) Randomly generated instances IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data can be downloaded from https://orlib.uqcloud.net/ Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457–487 [3]. Data format Experimental factors Experimental features Data source location Data accessibility Related research article",
+ "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data Tables, text files How data were acquired Artificially generated by a C þ þ program on Intels Xeons CPU E5– 2670 v2 with Linux operating system. Data format Raw Experimental factors Sixty randomly generated instances of the MDVSP with the number of depots in (8, 12, 16) and the number of trips in (1500, 2000, 2500, 3000) Experimental features Randomly generated instances Data source location IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data accessibility Data can be downloaded from https://orlib.uqcloud.net/ Related research article Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457–487 [3].",
"type": "Table"
},
{
@@ -338,7 +455,7 @@
"type": "ListItem"
},
{
- "element_id": "3ef13ef244943d8c0c99a778ae8a177e",
+ "element_id": "79e2a2e0c24e1e8befe2b6beb2f1df64",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -347,11 +464,11 @@
],
"page_number": 2
},
- "text": "e All the problem instances are available for use without any restrictions. e The benchmark solutions and solution time for the problem instances are presented in [3] and can",
+ "text": "e All the problem instances are available for use without any restrictions.",
"type": "NarrativeText"
},
{
- "element_id": "f7503405f2d9e9be5fcdb000f0248741",
+ "element_id": "d401597b8ff2854bfb89f2833d02a763",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -360,7 +477,7 @@
],
"page_number": 2
},
- "text": "be used for the comparison.",
+ "text": "e The benchmark solutions and solution time for the problem instances are presented in [3] and can be used for the comparison.",
"type": "ListItem"
},
{
@@ -390,7 +507,7 @@
"type": "Title"
},
{
- "element_id": "3f75ae21ef4d8973948ca41f0329efb9",
+ "element_id": "683993fc4592941bf8b06173870aa63c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -399,11 +516,11 @@
],
"page_number": 2
},
- "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number ‘RN-8–1500-01.dat’, for is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, ðm; nÞ, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net.",
+ "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number for the size, ‘ðm; nÞ’, respectively. For example, the problem instance, ‘RN-8–1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, ðm; nÞ, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net. For each problem instance, the following information is provided:",
"type": "NarrativeText"
},
{
- "element_id": "950af9be0340541a956f6ffad3ecb178",
+ "element_id": "fc547df12bfc22e91a0b5927670caa78",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -412,8 +529,8 @@
],
"page_number": 2
},
- "text": "For each problem instance, the following information is provided: The number of depots mð The number of trips ðnÞ, The number of locations ðlÞ, The number of vehicles at each depot, For each trip i A 1; 2; …; n, a start time, ts",
- "type": "NarrativeText"
+ "text": "The number of depots mð",
+ "type": "Title"
},
{
"element_id": "320f6d28582c354d35673c2a4119851f",
@@ -429,7 +546,20 @@
"type": "UncategorizedText"
},
{
- "element_id": "38269e29d1bd6e68141cb8e14afb50e5",
+ "element_id": "6aeb7fbb6968ef22dc0adfbe09dbb58b",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 2
+ },
+ "text": "The number of trips ðnÞ,",
+ "type": "UncategorizedText"
+ },
+ {
+ "element_id": "8fac4e6fad19e30a40504920771062f8",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -438,11 +568,37 @@
],
"page_number": 2
},
- "text": "i , a start location, ls i , an end time, te i , and an end location, le i , and",
+ "text": "The number of locations ðlÞ,",
+ "type": "UncategorizedText"
+ },
+ {
+ "element_id": "37965175e9f553f4c05167c81d0984d6",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 2
+ },
+ "text": "The number of vehicles at each depot,",
+ "type": "UncategorizedText"
+ },
+ {
+ "element_id": "96ca028aef61c1fd98c9f0232a833498",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 2
+ },
+ "text": "For each trip i A 1; 2; …; n, a start time, ts i , an end time, te i , a start location, ls i , and an end location, le i , and",
"type": "NarrativeText"
},
{
- "element_id": "9dd8b3854d974abcb78d8980939f6df7",
+ "element_id": "448de3300a8c7e2cfdd2028dd0bb4171",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -455,7 +611,7 @@
"type": "NarrativeText"
},
{
- "element_id": "8d5157e6f596a6e1f4f9c031b8fedb5e",
+ "element_id": "b13807f59ac7c6647ee0aee74f9b0dd3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -468,7 +624,7 @@
"type": "ListItem"
},
{
- "element_id": "376bb2fb8e9604649d5074748810bb10",
+ "element_id": "db480e847a5703b19be6b79223e1ee03",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -481,7 +637,7 @@
"type": "NarrativeText"
},
{
- "element_id": "223fae9ea1c1eb30d7676d252e4a65da",
+ "element_id": "326c44638a881f86474b82cc244896f9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -533,7 +689,7 @@
"type": "NarrativeText"
},
{
- "element_id": "d0b1bc28aacb27456fe8ec060b4e5255",
+ "element_id": "53d11fbc182749dc1483c8ebf8100d2c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -542,11 +698,11 @@
],
"page_number": 3
},
- "text": "1. Each schedule should start and end at the same depot. 2. Each trip should be covered by only one vehicle. 3. The number of schedules that start from a depot should not exceed the number of vehicles at",
+ "text": "1. Each schedule should start and end at the same depot.",
"type": "ListItem"
},
{
- "element_id": "2fbe14c5bf022eba7d64cbb1d33b2bca",
+ "element_id": "383a0750affc0a1ff2274816ab83ccf9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -555,11 +711,11 @@
],
"page_number": 3
},
- "text": "the depot.",
+ "text": "2. Each trip should be covered by only one vehicle.",
"type": "ListItem"
},
{
- "element_id": "94c0ad90a1bb3711fa03b5fda69166c8",
+ "element_id": "b71e166a3fac487df44a3cb5d5bcc953",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -568,11 +724,24 @@
],
"page_number": 3
},
- "text": "A sufficient number of vehicles are provided to maintain the feasibility of an instance. For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule.",
+ "text": "3. The number of schedules that start from a depot should not exceed the number of vehicles at the depot.",
+ "type": "ListItem"
+ },
+ {
+ "element_id": "b7a3531443154b36c9c85ffa48647e05",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3
+ },
+ "text": "A sufficient number of vehicles are provided to maintain the feasibility of an instance.",
"type": "NarrativeText"
},
{
- "element_id": "3d57bb91b2a5543faae14c6309327eda",
+ "element_id": "9d3f44c51fe13ebdf6b9511859e4f1b7",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -581,11 +750,24 @@
],
"page_number": 3
},
- "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots ðmÞ, the number of trips, ðnÞ, and the number of locations ðlÞ, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; …; n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i; jA 1; …; l",
+ "text": "For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule.",
"type": "NarrativeText"
},
{
- "element_id": "13cb9ea3a2ad87f3e489c0f816d0a762",
+ "element_id": "4c1454f26189366f8014d0815ed3afc1",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3
+ },
+ "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots (m), the number of trips, (n), and the number of locations (I), in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, ie{1,...,n}, and provides the start location, the start time, the end location, and the end time of trip i. The next | lines present the travel times between any two locations, i,j e {1, wal}.",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "d9904b5393369c5204af83b64035802a",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -598,7 +780,7 @@
"type": "NarrativeText"
},
{
- "element_id": "68f62c559d15d9cf62c956b66943ab71",
+ "element_id": "0db20c23a12e1b6eadee6eb8aecc17d8",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -607,11 +789,11 @@
],
"page_number": 3
},
- "text": "Table 1 Average number of locations, times, vehicles and empty travels for each instance size.",
- "type": "UncategorizedText"
+ "text": "Table 1",
+ "type": "Title"
},
{
- "element_id": "15bf6f769efd074ae328ba8a919fe673",
+ "element_id": "849ee5b486ae80522665b843341bd492",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -620,11 +802,24 @@
],
"page_number": 3
},
- "text": "Instance size (m, n) Average number of Locations Times Vehicles (8, 1500) (8, 2000) (8, 2500) (8, 3000) (12, 1500) (12, 2000) (12, 2500) (12, 3000) (16, 1500) (16, 2000) (16, 2500) (16, 3000) 568.40 672.80 923.40 977.00 566.00 732.60 875.00 1119.60 581.80 778.00 879.00 1087.20 975.20 1048.00 1078.00 1113.20 994.00 1040.60 1081.00 1107.40 985.40 1040.60 1083.20 1101.60 652.20 857.20 1082.40 1272.80 642.00 861.20 1096.00 1286.20 667.80 872.40 1076.40 1284.60 668,279.40 1,195,844.80 1,866,175.20 2,705,617.00 674,191.00 1,199,659.80 1,878,745.20 2,711,180.40 673,585.80 1,200,560.80 1,879,387.00 2,684,983.60",
+ "text": "Average number of locations, times, vehicles and empty travels for each instance size.",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "63de709cd751564fc9622864af4e9310",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3
+ },
+ "text": "Instance size (m, n) Average number of Locations Times Vehicles (8, 1500) 568.40 975.20 652.20 668,279.40 (8, 2000) 672.80 1048.00 857.20 1,195,844.80 (8, 2500) 923.40 1078.00 1082.40 1,866,175.20 (8, 3000) 977.00 1113.20 1272.80 2,705,617.00 (12, 1500) 566.00 994.00 642.00 674,191.00 (12, 2000) 732.60 1040.60 861.20 1,199,659.80 (12, 2500) 875.00 1081.00 1096.00 1,878,745.20 (12, 3000) 1119.60 1107.40 1286.20 2,711,180.40 (16, 1500) 581.80 985.40 667.80 673,585.80 (16, 2000) 778.00 1040.60 872.40 1,200,560.80 (16, 2500) 879.00 1083.20 1076.40 1,879,387.00 (16, 3000) 1087.20 1101.60 1284.60 2,684,983.60",
"type": "Table"
},
{
- "element_id": "f7847444eec1921a6618e0eb97d5ce40",
+ "element_id": "ec04cd3d411eed35515b3ea80ebac5af",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -650,7 +845,7 @@
"type": "Header"
},
{
- "element_id": "0a4152d3ee312a3d28cc2b63d6f59a6e",
+ "element_id": "ee868bdc4e792cd79b6ca4f1c953d18f",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -659,11 +854,24 @@
],
"page_number": 4
},
- "text": "Table 2 Description of file format for each problem instance.",
+ "text": "Table 2",
"type": "Title"
},
{
- "element_id": "08f12f3a48acd91df47c88ee29818b25",
+ "element_id": "3b85fcba083782b79205b4eb3d36b000",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 4
+ },
+ "text": "Description of file format for each problem instance.",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "17e17590003c0f514220c453f88da6b7",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -672,11 +880,11 @@
],
"page_number": 4
},
- "text": "Number of lines Number of columns in each line Description 1 1 n l 3 m 4 l The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i and location j. i , the start i , the end location le",
+ "text": "Number of Number of columns in Description lines each line 1 3 The number of depots, the number of trips, and the number of locations. 1 m The number of vehicles rd at each depot d. n 4 One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location ls i , the start i , the end location le time ts i and the end time te i for the corresponding trip. l l Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i and location j.",
"type": "Table"
},
{
- "element_id": "87a92f03858a6c0dca834e8d761fad1c",
+ "element_id": "42b81f7b374412677918314eb4c50b0b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -689,7 +897,7 @@
"type": "Title"
},
{
- "element_id": "33136db034d62e5e01ac821ec8b275c2",
+ "element_id": "56241511960379f96bdaa0db2d4143c4",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -702,7 +910,7 @@
"type": "NarrativeText"
},
{
- "element_id": "1c846fccfb6d9af904cc70c5c7c1fb06",
+ "element_id": "188dbf85ea2874b6d7a24ba7d468c2df",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -715,7 +923,7 @@
"type": "NarrativeText"
},
{
- "element_id": "94bc0bbd2096d54a7d7c315043221933",
+ "element_id": "e8f33017d2ad6a495ed09fd0c2dd5771",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -728,7 +936,7 @@
"type": "Title"
},
{
- "element_id": "91032c8f702188685199114840d2d934",
+ "element_id": "c099a73c6a332ceab3b11d480690b6b4",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -741,7 +949,7 @@
"type": "NarrativeText"
},
{
- "element_id": "2264d7dcff32783949952d865a0ac67f",
+ "element_id": "dd64556bf9fb6aec39ebcbec74ebe3a9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -754,7 +962,7 @@
"type": "Title"
},
{
- "element_id": "f58a55cffa2179242cb67a525fb43f66",
+ "element_id": "00311a8330f799ab5ee12c4c704ce0db",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -767,7 +975,7 @@
"type": "ListItem"
},
{
- "element_id": "9ec9dc8439ff48e46a1fbf6d07a53fe7",
+ "element_id": "b76af82ca59e2d4347ca8cc18fc37a0e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -780,7 +988,7 @@
"type": "ListItem"
},
{
- "element_id": "88901bd600ad2a39811f9a761ea63f14",
+ "element_id": "8d2a994da7c053f79017e9d2106d4680",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -793,7 +1001,7 @@
"type": "ListItem"
},
{
- "element_id": "19251c318ce218a75808ae503b2d66f3",
+ "element_id": "59ea62c54d1da68865c6c49121c3c469",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -806,7 +1014,7 @@
"type": "ListItem"
},
{
- "element_id": "2ec6f7b25d8beece707bbb57230fe77e",
+ "element_id": "0ca3ff6b5ba7ee78a40e64a79b2b1f5d",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -819,7 +1027,7 @@
"type": "ListItem"
},
{
- "element_id": "d4665f68c9443a201143c62674372052",
+ "element_id": "806d3d417aa93116103226da5011f8e0",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
diff --git a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json
index 41310d470e..3641fcd434 100644
--- a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json
@@ -13,7 +13,7 @@
"type": "Header"
},
{
- "element_id": "6e95de55fbc805ac11d5e168881e41eb",
+ "element_id": "0431d4983254ddadb5eaa405ce7c76bd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -22,11 +22,50 @@
],
"page_number": 1
},
- "text": "ns; 40 mg/day=3.6%, p<0.05; 80 mg/day=4.9%, p<0.01; 120 mg/day=9.3%, p<0.001, PM dosing group: 20 mg/day=-0.4%, ns; 40 mg/day=2.8%, p<0.05: 80 mg/day=0.2%, ns; 160 mg/day=5.8%, p<0.05). There was no clear dose-dependent trend associated with nausea and RD was similar between AM and PM dosing group (AM dosing group: 20 mg/ day=0.2% ns; 40 mg/day=3.8%, p<0.05; 80 mg/day=3.8%, ns; 120 mg/ day=6.6%, ns, PM dosing group: 20 mg/day=-1.6%, ns; 40 mg/day=-1.7%, ns; 80 mg/day=5.5%, p<0.01; 160 mg/day=2.8%, ns). Discussion: The risk of adverse events in the treatment of schizophrenia with lurasidone can vary depending on the timing of administration. In particular, for akathisia and somnolence, the incidence risks were reduced when lurasidone was administered in PM. Unlike with AM administration, the dose-dependence in the risks of these adverse events were not observed in lurasidone PM administration. The timing of lurasidone administration could be considered in effort to minimize potential adverse events.",
+ "text": "ns; 40 mg/day=3.6%, p<0.05; 80 mg/day=4.9%, p<0.01; 120 mg/day=9.3%,",
+ "type": "UncategorizedText"
+ },
+ {
+ "element_id": "714cf871095b12c7057d4f6a55255a8a",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "p<0.001, PM dosing group: 20 mg/day=-0.4%, ns; 40 mg/day=2.8%, p<0.05: 80 mg/day=0.2%, ns; 160 mg/day=5.8%, p<0.05). There was no clear dose-dependent trend associated with nausea and RD was similar between AM and PM dosing group (AM dosing group: 20 mg/ day=0.2% ns; 40 mg/day=3.8%, p<0.05; 80 mg/day=3.8%, ns; 120 mg/ day=6.6%, ns, PM dosing group: 20 mg/day=-1.6%, ns; 40 mg/day=-1.7%, ns; 80 mg/day=5.5%, p<0.01; 160 mg/day=2.8%, ns).",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "a245a973a58f08d82a93dad258eafd42",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "Discussion: The risk of adverse events in the treatment of schizophrenia with lurasidone can vary depending on the timing of administration. In particular, for akathisia and somnolence, the incidence risks were reduced when lurasidone was administered in PM. Unlike with AM administration, the dose-dependence in the risks of these adverse events were not observed in lurasidone PM administration.",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "b42c4325c538fdf309d42c40ec604896",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "The timing of lurasidone administration could be considered in effort to minimize potential adverse events.",
"type": "NarrativeText"
},
{
- "element_id": "c0ad446ac0e663713724aa5f42d20448",
+ "element_id": "3722605e6b20dc38cd47d29a1f2f0bed",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -35,11 +74,11 @@
],
"page_number": 1
},
- "text": "S6. SLEEP ENDOPHENOTYPES OF SCHIZOPHRENIA: A HIGH-DENSITY EEG STUDY IN DRUG-NAÏVE, FIRST EPISODE PSYCHOSIS PATIENTS",
+ "text": "S6. SLEEP ENDOPHENOTYPES OF",
"type": "Title"
},
{
- "element_id": "21facf77763c3e990a3db1b8626c133a",
+ "element_id": "5bd04ba883701d50a02084cad2e6265f",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -48,11 +87,63 @@
],
"page_number": 1
},
- "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy",
- "type": "UncategorizedText"
+ "text": "SCHIZOPHRENIA: A HIGH-DENSITY EEG STUDY IN DRUG-NAÏVE, FIRST EPISODE PSYCHOSIS PATIENTS",
+ "type": "Title"
+ },
+ {
+ "element_id": "6ee8eaddff375a9956442277027ac583",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "ffc023def05332d8cae421b02bdbd618",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "f5c9090c414f6bbea2710e1ff168fe5b",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding.",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "b7cea49bbbf0078693d6235d16fd58cc",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10).",
+ "type": "NarrativeText"
},
{
- "element_id": "26b6989522e94c2c7ef5c2633e41cf72",
+ "element_id": "d24821f87d0d03dbdc99084ad0fb5131",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -61,11 +152,11 @@
],
"page_number": 1
},
- "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. 
Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC.",
+ "text": "Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC.",
"type": "NarrativeText"
},
{
- "element_id": "b38798d4ed1cda1c49ed2db924d40039",
+ "element_id": "32e2d561158fd20a749e2329cb9d94dc",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -78,7 +169,7 @@
"type": "Title"
},
{
- "element_id": "6681a3fc2e2bbc7efabbf221baaeec6b",
+ "element_id": "51cb7675de5ed12d4a6fd401e1fc993e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -91,7 +182,7 @@
"type": "Header"
},
{
- "element_id": "418368d1fe238e68fc6c8663f7485649",
+ "element_id": "d561af626afc1c4dd5d6082013fa8d00",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -104,7 +195,7 @@
"type": "NarrativeText"
},
{
- "element_id": "2693595cd6fc5be02dc752b089f85eea",
+ "element_id": "a79609d8a4ad2200e95940af512745b4",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -117,7 +208,7 @@
"type": "NarrativeText"
},
{
- "element_id": "3f2d8de4445801a7562416267c06a877",
+ "element_id": "0eb46f899b8c3b96466eb49974ea3d5e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -130,7 +221,46 @@
"type": "NarrativeText"
},
{
- "element_id": "741c946db28df5068fb60063dad37d27",
+ "element_id": "994d1b085c8a959876a1da6226c5ab9c",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "Background: Meta-analytic evidence showed increased levels of periph- eral endocannabinoid metabolites in psychotic illness. Alterations in the endocannabinoid system are believed to compromise glutamate and do- pamine transmission, which play a central role in pathophysiological models of psychosis. I will present preliminary data from an ongoing high-field proton magnetic resonance spectroscopy (MRS) study aimed at investigating the association between peripheral levels of endocannabinoid system metabolites and central glutamate metabolism in individuals at their first non-affective psychotic episode (NA-FEP) and healthy controls.",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "8d852bceb180e277c2ccc099d18cd21a",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "Methods: We expect to recruit 17 NA-FEP and 20 healthy controls by January 2020. Currently, we recruited 12 NA-FEP and 18 healthy controls from two different research facilities (Imperial College London and University of Oxford) as part of a cross-sectional study. Participants un- derwent MRS scanning at 7-T with voxels placed in right dorsolateral prefrontal cortex (right-DLPFC), anterior cingulate cortex (ACC), and oc- cipital cortex. Neuro-metabolites will be calculated using the unsuppressed water signal as reference. Endocannabinoid metabolites were quantified from serum samples, collected during the same imaging session.",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "3c344f295bbe8b591ab524cdd3e7f40e",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1
+ },
+ "text": "Results: Analyses are ongoing. Based on previous evidence, expected findings are: (i) reduced glutamate levels in the ACC and right-DLPFC of NA-FEP compared to controls; (ii) increased peripheral endocannabinoid metabolites in NA-FEP compared to controls; and (iii) inverse association between peripheral endocannabinoid metabolites and glutamate levels in ACC and right-DLPFC in NA-FEP",
+ "type": "NarrativeText"
+ },
+ {
+ "element_id": "9b8b20faafc260c3d1789767766466f5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -139,11 +269,11 @@
],
"page_number": 1
},
- "text": "Background: Meta-analytic evidence showed increased levels of periph- eral endocannabinoid metabolites in psychotic illness. Alterations in the endocannabinoid system are believed to compromise glutamate and do- pamine transmission, which play a central role in pathophysiological models of psychosis. I will present preliminary data from an ongoing high-field proton magnetic resonance spectroscopy (MRS) study aimed at investigating the association between peripheral levels of endocannabinoid system metabolites and central glutamate metabolism in individuals at their first non-affective psychotic episode (NA-FEP) and healthy controls. Methods: We expect to recruit 17 NA-FEP and 20 healthy controls by January 2020. Currently, we recruited 12 NA-FEP and 18 healthy controls from two different research facilities (Imperial College London and University of Oxford) as part of a cross-sectional study. Participants un- derwent MRS scanning at 7-T with voxels placed in right dorsolateral prefrontal cortex (right-DLPFC), anterior cingulate cortex (ACC), and oc- cipital cortex. Neuro-metabolites will be calculated using the unsuppressed water signal as reference. Endocannabinoid metabolites were quantified from serum samples, collected during the same imaging session. Results: Analyses are ongoing. Based on previous evidence, expected findings are: (i) reduced glutamate levels in the ACC and right-DLPFC of NA-FEP compared to controls; (ii) increased peripheral endocannabinoid metabolites in NA-FEP compared to controls; and (iii) inverse association between peripheral endocannabinoid metabolites and glutamate levels in ACC and right-DLPFC in NA-FEP Discussion: This study will help clarifying the contribution of peripheral endocannabinoid system to central brain mechanisms of key relevance for psychotic illness. It will also add further evidence on the limited literature on high-resolution characterisation of brain metabolites in early psychosis. 
Strengths of the study include: (i) use of high-field MRS, which allows the estimation of glutamate-related compounds at higher precision than at lower field strength; (ii) reduced heterogeneity of the clinical sample (only male and NA-FEP). Limitations: small sample size and cross-sectional design.",
+ "text": "Discussion: This study will help clarifying the contribution of peripheral endocannabinoid system to central brain mechanisms of key relevance for psychotic illness. It will also add further evidence on the limited literature on high-resolution characterisation of brain metabolites in early psychosis. Strengths of the study include: (i) use of high-field MRS, which allows the estimation of glutamate-related compounds at higher precision than at lower field strength; (ii) reduced heterogeneity of the clinical sample (only male and NA-FEP). Limitations: small sample size and cross-sectional design.",
"type": "NarrativeText"
},
{
- "element_id": "c1543aee0d7efb59052757f7b83a70a9",
+ "element_id": "e779c8b7b98a2a4c2de2d4d65db49726",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -156,7 +286,7 @@
"type": "NarrativeText"
},
{
- "element_id": "5afb27a02de3e7a95c0f2fa442e32526",
+ "element_id": "ad58a94e747d9fe18e2550e58c54f6bc",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -169,7 +299,7 @@
"type": "NarrativeText"
},
{
- "element_id": "0d80b62dd72121dd5263df8605849cf4",
+ "element_id": "6a0290d48528f40c9c2288fddff94e3e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
diff --git a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json
index 1d86fb9c6d..89a5b524bc 100644
--- a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json
@@ -1055,8 +1055,8 @@
},
{
"type": "NarrativeText",
- "element_id": "31ed39cf3f959ddf86d3eba65cb79a01",
- "text": "Nuclear energy and the risk of radiation is one of the most extreme cases in which perceived and actual risks have diverged. The fear of radiation, whilst pre- dating the Second World War, was firmly established by the debate on the potential impacts of low-dose radiation from the fallout from nuclear weapons testing in the early years of the Cold War. Radiation in many ways became linked with the mental imagery of nuclear war, playing an important role in increasing public concern about radiation and its health effects. There is a well-established discrepancy between fact-based risk assessments and public perception of different risks. This is very much the case with nuclear power, and this is clearly highlighted in Figure 1, with laypersons ranking nuclear power as the highest risk out of 30 activities and technologies, with experts ranking nuclear as 20th. In many ways, popular culture\u2019s depiction of radiation has played a role in ensuring that this discrepancy has remained, be it Godzilla, The Incredible Hulk, or The Simpsons, which regularly plays on the notion of radiation from nuclear power plants causing three-eyed fish, something that has been firmly rejected as unscientific.",
+ "element_id": "75f32a291a5cbb11d3183eac5fb426c3",
+ "text": "Nuclear energy and the risk of radiation is one of the most extreme cases in which perceived and actual risks have diverged. The fear of radiation, whilst pre- dating the Second World War, was firmly established by the debate on the potential impacts of low-dose radiation from the fallout from nuclear weapons testing in the early years of the Cold War. Radiation in many ways became linked with the mental imagery of nuclear war, playing an important role in increasing public concern about radiation and its health effects. There is a well-established discrepancy between fact-based risk assessments and public perception of different risks. This is very much the case with nuclear power, and this is clearly highlighted in Figure 1, with laypersons ranking nuclear power as the highest risk out of 30 activities and technologies, with experts ranking nuclear as 20th. In many ways, popular culture\u2019s depiction of radiation has played a role in ensuring that this discrepancy has remained,",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1117,8 +1117,8 @@
},
{
"type": "Title",
- "element_id": "a66214340855880a5393384d1363511c",
- "text": "Rank Order Laypersons",
+ "element_id": "bf248ce5194cc4686f97a2769cd9744a",
+ "text": "Rank Order",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1179,8 +1179,132 @@
},
{
"type": "Table",
- "element_id": "9512f477364e1da1fa60dbd237c41f85",
- "text": "Experts 1 20 Nuclear power Motor vehicles 2 1 4 3 Handguns 2 4 Smoking Electric power (non-nuclear) 9 17 22 7 X-rays 25 30 Vaccinations",
+ "element_id": "fe081bc8fc80f0df977f46493f0e9430",
+ "text": "Laypersons Experts 1 Nuclear power 20 2 Motor vehicles 1 3 Handguns 4 4 Smoking 2 17 Electric power (non-nuclear) 9 22 X-rays 7 30 Vaccinations 25",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 4,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "FigureCaption",
+ "element_id": "a6b2ef41c3420b21165799903ccece40",
+ "text": "Figure 1. Ordering of perceived risks for 30 activities and technologies1,iii",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 4,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "6a806ec2935a7f5da409a3f892006982",
+ "text": "be it Godzilla, The Incredible Hulk, or The Simpsons, which regularly plays on the notion of radiation from nuclear power plants causing three-eyed fish, something that has been firmly rejected as unscientific.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1241,7 +1365,7 @@
},
{
"type": "NarrativeText",
- "element_id": "3ff36869cefb14183f0955094a908fc5",
+ "element_id": "b794118cfefdf05953a1eab3ecb49928",
"text": "In reality, radiation is a natural part of life; indeed, we are all exposed to radiation every day, on average receiving 2-3 millisieverts (mSv) per year. Most of this radiation is naturally occurring, with radon gas from the ground being the main source of exposure. The nuclear industry is responsible for a very small part of radiation exposure to the public, as seen in Figure 2. To put this into perspective, eating 10 bananas or two Brazil nuts results in the same radiation dose as living nearby a nuclear power plant for a year. Humans are also naturally radioactive, and the radiation dose from sleeping next to someone else each night for a year is ten times higher than the exposure from living nearby a nuclear power plant for the same time span.",
"metadata": {
"filetype": "application/pdf",
@@ -1303,7 +1427,7 @@
},
{
"type": "NarrativeText",
- "element_id": "773667f6d4fbe19cc347ace06ca3664e",
+ "element_id": "60d4e572d6bfa2687304ba3b28f12fcf",
"text": "In fact, scientific consensus is that when it comes to preventing exposure to radiation, nuclear power is much better than other electricity generators. A 2016 reportiii from the United Nations Scientific Committee on the Effects of Atomic Radiation (UNSCEAR) found that coal-generated electricity is responsible for more than half of the total global radiation exposure arising from electricity generation, while nuclear power contributed less than a fifth. Coal miners received high occupational exposure and workers in solar and wind farms received the highest occupational exposure associated with plant construction for the same amount of installed capacity.",
"metadata": {
"filetype": "application/pdf",
@@ -1365,7 +1489,7 @@
},
{
"type": "NarrativeText",
- "element_id": "6dab7cb99fa308838e8c0413caccb7f1",
+ "element_id": "9600ead8e685b080a9ae84455b6be4f9",
"text": "1 The original study was published in 1978, but its findings have been confirmed by numerous studies since.",
"metadata": {
"filetype": "application/pdf",
@@ -1427,8 +1551,8 @@
},
{
"type": "Image",
- "element_id": "57a9b2172894596e88b48caac276416d",
- "text": "Natural Artificial 48% Radon 14% Buildings & soil 12% Food & water 10% Cosmic 4% Thoron 11% Medicine 0.4% 0.4% Miscellaneous 0.2% Occupational 0.04% Nuclear discharges Fallout ",
+ "element_id": "695e4e06071f6ed026e30f329659adff",
+ "text": "Natural Artificial 48% Radon 11% Medicine 14% Buildings & soil 0.4% Fallout 12% Food & water 0.4% Miscellaneous 10% Cosmic 0.2% Occupational 4% Thoron 0.04% Nuclear discharges ",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1736,9 +1860,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "6c9fe7851d0f4e06c5ec939f53dbce3b",
- "text": "r a e y",
+ "type": "Title",
+ "element_id": "a8706e82b3f90cffc996a24348e3b670",
+ "text": "r",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1799,8 +1923,8 @@
},
{
"type": "Title",
- "element_id": "a70b649d3f49fafd8a15a6617364bd69",
- "text": "W T",
+ "element_id": "da631c23500655c51b9311a61f55744f",
+ "text": "a",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1860,9 +1984,1063 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "26963be98ae7ff2e8c0428862d074cf6",
- "text": "r e p s e i t i l",
+ "type": "Title",
+ "element_id": "d78a11e9e55235934c3a4922053c68e5",
+ "text": "e",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "8d14df8b7fd7744365fbf8e02d69415a",
+ "text": "y",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "f4df01bee1b8ffb973ac8539649c5189",
+ "text": "W",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "b733cf49de269e22bed7c9883b958669",
+ "text": "T",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "c4b47d788b26c3d5c62ad462ed3ca2db",
+ "text": "r",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "bff4435574259239761670b31432cc8a",
+ "text": "e",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "8ba15a3a71eb0bb689c582098cce6730",
+ "text": "p",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "5fde097ba00ad7647206ae11c721d28c",
+ "text": "s",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "UncategorizedText",
+ "element_id": "81331ee9da4145c2651d6483696fe966",
+ "text": "8",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "81f1f3b9da6df38d938bf7871fa069b5",
+ "text": "e",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "aa4a79651a9a0087b66fcc40a2213113",
+ "text": "i",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "6d1c0d05d3a424b43d9572188a76c2d4",
+ "text": "t",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "392a17b2f3eba46f4bcf078e0b204514",
+ "text": "i",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "d24a9a771e46fdd6b269f1ecaf0b5eec",
+ "text": "l",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "9dc4537afa8ae0b959a542f9ba5c1e03",
+ "text": "S",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "919dac2487a4c860747318a132a54a72",
+ "text": "a",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "04ee5d05c3fcfffd945762e803478600",
+ "text": "t",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
+ "record_locator": {
+ "file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
+ },
+ "date_created": "1718723636.34",
+ "date_modified": "1676196572.0",
+ "permissions_data": [
+ {
+ "id": "18298851591250030956",
+ "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
+ "emailAddress": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "04774006893477068632",
+ "displayName": "ryan",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
+ "emailAddress": "ryan@unstructured.io",
+ "role": "writer",
+ "deleted": false,
+ "pendingOwner": false
+ },
+ {
+ "id": "anyoneWithLink",
+ "type": "anyone",
+ "kind": "drive#permission",
+ "role": "reader",
+ "allowFileDiscovery": false
+ },
+ {
+ "id": "09147371668407854156",
+ "displayName": "roman",
+ "type": "user",
+ "kind": "drive#permission",
+ "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
+ "emailAddress": "roman@unstructured.io",
+ "role": "owner",
+ "deleted": false,
+ "pendingOwner": false
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "63dabde368e2cf310d20a885fe50314a",
+ "text": "a",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1923,8 +3101,8 @@
},
{
"type": "Title",
- "element_id": "b69c60ea2e3fa24e25a069f90ee4b696",
- "text": "a t a F",
+ "element_id": "796538927664e4d87312c428469428f5",
+ "text": "F",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1985,7 +3163,7 @@
},
{
"type": "FigureCaption",
- "element_id": "af241d12aef0f51bace400db4e14649d",
+ "element_id": "d1496d2dc28f6877646e280c0c47e9ab",
"text": "Figure 3. Comparison of number of fatalities due to electricity generation, including accidents and air pollution3",
"metadata": {
"filetype": "application/pdf",
@@ -2047,7 +3225,7 @@
},
{
"type": "NarrativeText",
- "element_id": "1e41836d6d7be638be9c0de0ce2c2256",
+ "element_id": "76619db169f10599a1fb73a13fdebafb",
"text": "Contrary to perceptions, nuclear is an incredibly safe source of energy (see Figure 3 for a comparison). What is also clear is that the continued use of alternative energy sources in preference to nuclear energy \u2013 in particular fossil fuels \u2013 poses a far greater risk to public health by significantly contributing to climate change and air pollution.",
"metadata": {
"filetype": "application/pdf",
@@ -2109,7 +3287,7 @@
},
{
"type": "ListItem",
- "element_id": "59b04b23d82c7c013abd6477a14c9425",
+ "element_id": "ffdd87176fddbe78d853186bf86602ea",
"text": "2 Including 28 firefighters that were exposed to lethal amounts of radiation during the accident night, and 15 fatal cases of thyroid cancer. 3 Sources drawn upon: Markandya, A., & Wilkinson, P. (2007), Sovacool et al. (2016). Data for nuclear accidents modified to reflect the 2012 UNSCEAR report and the 2015 US NRC SOARCA study.",
"metadata": {
"filetype": "application/pdf",
@@ -2171,7 +3349,7 @@
},
{
"type": "Header",
- "element_id": "9a8b3b64f6d252a6d31d87d306952ca2",
+ "element_id": "8d78bc83fa3b857721631a0491d6039b",
"text": "3",
"metadata": {
"filetype": "application/pdf",
@@ -2977,8 +4155,8 @@
},
{
"type": "Image",
- "element_id": "dc83c2d2395c30a8785a2533424f1c72",
- "text": "Plant-level production costs at market prices Grid-level costs of the electricity system Social and environmental costs of emissions, land-use, climate change, security of supply, etc. ",
+ "element_id": "96aca413098163140d5213641ae01231",
+ "text": "Plant-level Social and production costs Grid-level costs environmental costs of at market prices of the electricity emissions, land-use, system climate change, security of supply, etc. ",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -4279,8 +5457,8 @@
},
{
"type": "ListItem",
- "element_id": "a3be0ee530629ee7a2413f05eb0cce76",
- "text": "BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712",
+ "element_id": "2ab37467d413d491735b002a679afdb8",
+ "text": "ii BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -4465,8 +5643,8 @@
},
{
"type": "ListItem",
- "element_id": "4ca71e69090af4ad16216a0ddcc0a168",
- "text": "International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018",
+ "element_id": "f8c502221064df965c932a0b76e0717b",
+ "text": "v International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -4527,8 +5705,8 @@
},
{
"type": "ListItem",
- "element_id": "16bb7fca4ab44ffc73e847ea7b93fc4d",
- "text": "Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8",
+ "element_id": "deb6d3d9e6eae5a2256fbc12db133555",
+ "text": "vi Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8",
"metadata": {
"filetype": "application/pdf",
"languages": [
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
index 94be393544..8c3c0f6ae6 100644
--- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
@@ -1,8 +1,8 @@
[
{
"type": "UncategorizedText",
- "element_id": "d3ce55f220dfb75891b4394a18bcb973",
- "text": "1 2 0 2",
+ "element_id": "04fa31034847cbbf6c840f4da683ccf8",
+ "text": "1",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -22,9 +22,9 @@
}
},
{
- "type": "Header",
- "element_id": "d8294655784148f3059eb08db918977c",
- "text": "n u J 1 2 ] V C . s c [",
+ "type": "UncategorizedText",
+ "element_id": "f5f58f76a61e94f7fe903b999a570c72",
+ "text": "2021",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -45,8 +45,52 @@
},
{
"type": "UncategorizedText",
- "element_id": "c0cdc594eccc53cfb75eeef0ad75b65b",
- "text": "2 v 8 4 3 5 1 . 3 0 1 2 : v i X r a",
+ "element_id": "fc05a198b2ff732119edea8986775994",
+ "text": "2",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "UncategorizedText",
+ "element_id": "4a90480c2297c31b4d7ad43b0801ae98",
+ "text": "0",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Header",
+ "element_id": "e3a383b7e9439f39773c13ea769297b7",
+ "text": "2 n u J 1 2 ] V C . s c [ 2 v 8 4 3 5 1 . 3 0 1 2 :",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -67,7 +111,139 @@
},
{
"type": "Title",
- "element_id": "4467e9baee9456824c1aa679526f6979",
+ "element_id": "4608f9aa33a0cab158565817b0d15743",
+ "text": "v",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "6f69e5f921907e689f1a52bd84282b31",
+ "text": "arXiv",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "ed4e590932b333f40d0e1367b6b0e32e",
+ "text": "i",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "8cb024fb60457b7c572b167801037f75",
+ "text": "X",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "c202bdacd2daf4c52fa3a6ddd64a0728",
+ "text": "r",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "3db474893ec321c81ef9d1a2afd5f660",
+ "text": "a",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "d3be9e3d661e2a79f37257caa5b54d8c",
"text": "LayoutParser: A Uni\ufb01ed Toolkit for Deep Learning Based Document Image Analysis",
"metadata": {
"filetype": "application/pdf",
@@ -89,7 +265,7 @@
},
{
"type": "NarrativeText",
- "element_id": "33dff5d4db499a435f61220a890d3f04",
+ "element_id": "7cf062c1ba64938cc68c4fae61506d84",
"text": "Zejiang Shen! (4), Ruochen Zhang\u201d, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson\u2019, and Weining Li>",
"metadata": {
"filetype": "application/pdf",
@@ -111,7 +287,7 @@
},
{
"type": "NarrativeText",
- "element_id": "a0bbb18d9710661eb9e2aa6e651e6555",
+ "element_id": "23b8def20ce16f929d4f558b2a19f200",
"text": "1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca",
"metadata": {
"filetype": "application/pdf",
@@ -133,7 +309,7 @@
},
{
"type": "NarrativeText",
- "element_id": "ea0cc468a56f4af65b968fb86913bdfe",
+ "element_id": "f1169388c7749db52e388e2fe4feaec6",
"text": "Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model con\ufb01gurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going e\ufb00orts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.",
"metadata": {
"filetype": "application/pdf",
@@ -155,7 +331,7 @@
},
{
"type": "NarrativeText",
- "element_id": "370e1b61d1dab8ae35d62eb6f42feceb",
+ "element_id": "caffc7480fdd82a089ae387e01aabdb9",
"text": "Keywords: Document Image Analysis \u00b7 Deep Learning \u00b7 Layout Analysis \u00b7 Character Recognition \u00b7 Open Source library \u00b7 Toolkit.",
"metadata": {
"filetype": "application/pdf",
@@ -177,7 +353,7 @@
},
{
"type": "Title",
- "element_id": "f12febfe29a59a8e4ce6b3494d6deb8a",
+ "element_id": "bcb94891b0d7a997ab7e28d99195ff37",
"text": "Introduction",
"metadata": {
"filetype": "application/pdf",
@@ -199,7 +375,7 @@
},
{
"type": "NarrativeText",
- "element_id": "fd81374ba214b5472d0b60b2371ae8df",
+ "element_id": "8de96d1e80af35f9b6954252e14c2caf",
"text": "Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classi\ufb01cation [11,",
"metadata": {
"filetype": "application/pdf",
@@ -594,9 +770,141 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "e55d055f6205d93e21c673d749264e7a",
- "text": "6 The number shown is obtained by specifying the search type as \u2018code\u2019. 7 https://ocr-d.de/en/about 8 https://github.com/BobLd/DocumentLayoutAnalysis 9 https://github.com/leonlulu/DeepLayout 10 https://github.com/hpanwar08/detectron2 11 https://github.com/JaidedAI/EasyOCR 12 https://github.com/PaddlePaddle/PaddleOCR",
+ "type": "Footer",
+ "element_id": "b1fa4bbd1bdda08489faab5bf3adf5cc",
+ "text": "6 The number shown is obtained by specifying the search type as \u2018code\u2019.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "db639db124b6064248de0c0dc71510a4",
+ "text": "7 https://ocr-d.de/en/about",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "d881ce84f017d89f6e35e2bc4b133bfc",
+ "text": "8 https://github.com/BobLd/DocumentLayoutAnalysis",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "9b96c128deddda1a32c739a2df157496",
+ "text": "9 https://github.com/leonlulu/DeepLayout",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "5cf72e821375f4480a1529bef97608ef",
+ "text": "10 https://github.com/hpanwar08/detectron2",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "4ab94e79eedc3a7ac498aaf737ca8878",
+ "text": "11 https://github.com/JaidedAI/EasyOCR",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "460b163c13ad7cad4fce325820a76481",
+ "text": "12 https://github.com/PaddlePaddle/PaddleOCR",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -837,8 +1145,8 @@
},
{
"type": "Table",
- "element_id": "2a62c55be8401908c18140e858ec3345",
- "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scienti\ufb01c documents Layouts of scanned modern magazines and scienti\ufb01c reports Layouts of scanned US newspapers from the 20th century Table region on modern scienti\ufb01c and business document Layouts of history Japanese documents",
+ "element_id": "cb534ba64da736dc53d60b660f5e1153",
+ "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] F / M M Layouts of modern scienti\ufb01c documents PRImA [3] M - Layouts of scanned modern magazines and scienti\ufb01c reports Newspaper [17] F - Layouts of scanned US newspapers from the 20th century TableBank [18] F F Table region on modern scienti\ufb01c and business document HJDataset [31] F / M - Layouts of history Japanese documents",
"metadata": {
"text_as_html": "
Dataset | | Base Model'| | | Notes |
---|
PubLayNet B8]| | F/M | Layouts of modern scientific documents |
PRImA | M | Layouts of scanned modern magazines and scientific report |
Newspaper | F | Layouts of scanned US newspapers from the 20th century |
TableBank | F | Table region on modern scientific and business document |
HJDataset | F/M | Layouts of history Japanese documents |
",
"filetype": "application/pdf",
@@ -948,8 +1256,8 @@
},
{
"type": "NarrativeText",
- "element_id": "a25137fdc995e079684174269dc0effa",
- "text": "1 import layoutparser as lp 2 image = cv2 . imread ( \" image_file \" ) # load images 3 model = lp . De t e c tro n2 Lay outM odel ( \" lp :// PubLayNet / f as t er _ r c nn _ R _ 50 _ F P N_ 3 x / config \" ) 4 5 layout = model . detect ( image )",
+ "element_id": "2f41c1732a2870b1fecd72dec1b2ff3d",
+ "text": "1 import layoutparser as lp 2 image = cv2 . imread ( \" image_file \" ) # load images 3 model = lp . De t e c tro n2 Lay outM odel ( 4 \" lp :// PubLayNet / f as t er _ r c nn _ R _ 50 _ F P N_ 3 x / config \" ) 5 layout = model . detect ( image )",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1433,8 +1741,30 @@
},
{
"type": "NarrativeText",
- "element_id": "97ec9aa9ca6081a954acf13151c9239e",
- "text": "The end goal of DIA is to transform the image-based document data into a structured database. LayoutParser supports exporting layout data into di\ufb00erent formats like JSON, csv, and will add the support for the METS/ALTO XML format 14 . It can also load datasets from layout analysis-speci\ufb01c formats like COCO [38] and the Page Format [25] for training layout models (Section 3.5). Visualization of the layout detection results is critical for both presentation and debugging. LayoutParser is built with an integrated API for displaying the layout information along with the original document image. Shown in Figure 3, it enables presenting layout data with rich meta information and features in di\ufb00erent modes. More detailed information can be found in the online LayoutParser documentation page.",
+ "element_id": "afa5f1dc8b4ce5598f278992d818eaa9",
+ "text": "The end goal of DIA is to transform the image-based document data into a structured database. LayoutParser supports exporting layout data into di\ufb00erent formats like JSON, csv, and will add the support for the METS/ALTO XML format 14 . It can also load datasets from layout analysis-speci\ufb01c formats like COCO [38] and the Page Format [25] for training layout models (Section 3.5).",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 8,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "28aeb996f497c9d01d06e564483d0854",
+ "text": "Visualization of the layout detection results is critical for both presentation and debugging. LayoutParser is built with an integrated API for displaying the layout information along with the original document image. Shown in Figure 3, it enables presenting layout data with rich meta information and features in di\ufb00erent modes. More detailed information can be found in the online LayoutParser documentation page.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1455,7 +1785,7 @@
},
{
"type": "Title",
- "element_id": "b3c9f96506599f418cc474db4adb5b0d",
+ "element_id": "9e8599877fa8025a800477652dcd29be",
"text": "3.5 Customized Model Training",
"metadata": {
"filetype": "application/pdf",
@@ -1477,7 +1807,7 @@
},
{
"type": "NarrativeText",
- "element_id": "1a011956c708d01abb2c058ec28c126f",
+ "element_id": "05e5f4e2a196db34263541d1ecebe297",
"text": "Besides the o\ufb00-the-shelf library, LayoutParser is also highly customizable with supports for highly unique and challenging document analysis tasks. Target document images can be vastly di\ufb00erent from the existing datasets for train- ing layout models, which leads to low layout detection accuracy. Training data",
"metadata": {
"filetype": "application/pdf",
@@ -1499,7 +1829,7 @@
},
{
"type": "NarrativeText",
- "element_id": "05f6e98c538d1912459b1e568871e6c2",
+ "element_id": "894921dce9d1291116c38d561c2fff59",
"text": "14 https://altoxml.github.io",
"metadata": {
"filetype": "application/pdf",
@@ -1895,8 +2225,8 @@
},
{
"type": "NarrativeText",
- "element_id": "07a7dcb89bd4b78209da5f28e2877a1a",
- "text": "The digitization of historical documents can unlock valuable data that can shed light on many important social, economic, and historical questions. Yet due to scan noises, page wearing, and the prevalence of complicated layout structures, ob- taining a structured representation of historical document scans is often extremely complicated. In this example, LayoutParser was used to develop a comprehensive pipeline, shown in Figure 5, to gener- ate high-quality structured data from historical Japanese \ufb01rm \ufb01nancial ta- bles with complicated layouts. The pipeline applies two layout models to identify di\ufb00erent levels of document structures and two customized OCR engines for optimized character recog- nition accuracy.",
+ "element_id": "083b7889c33f34e7d0479c233cdccc34",
+ "text": "The digitization of historical documents can unlock valuable data that can shed light on many important social, economic, and historical questions. Yet due to scan noises, page wearing, and the prevalence of complicated layout structures, ob- taining a structured representation of historical document scans is often extremely complicated.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1916,9 +2246,9 @@
}
},
{
- "type": "Image",
- "element_id": "02890858b0555e354a0336addbf54a7e",
- "text": "(spe peepee, \u2018Active Learning Layout Annotate Layout Dataset | + \u2018Annotation Toolkit \u00a5 a Deep Leaming Layout Model Training & Inference, \u00a5 ; Handy Data Structures & Post-processing El Apis for Layout Det a LAR ror tye eats) 4 Text Recognition | <\u2014\u2014 Default ane Customized \u00a5 ee Layout Structure Visualization & Export | <\u2014\u2014 | visualization & Storage The Japanese Document Helpful LayoutParser Digitization Pipeline Modules",
+ "type": "NarrativeText",
+ "element_id": "76dd07abeb9f4bbcb77152deb52c9dc0",
+ "text": "In this example, LayoutParser was used to develop a comprehensive pipeline, shown in Figure 5, to gener- ate high-quality structured data from historical Japanese \ufb01rm \ufb01nancial ta- bles with complicated layouts. The pipeline applies two layout models to identify di\ufb00erent levels of document structures and two customized OCR engines for optimized character recog- nition accuracy.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1959,9 +2289,31 @@
}
}
},
+ {
+ "type": "Image",
+ "element_id": "f48a844114951222f6c96331efc683fb",
+ "text": "(spe peepee, \u2018Active Learning Layout Annotate Layout Dataset | + \u2018Annotation Toolkit \u00a5 a Deep Leaming Layout Model Training & Inference, \u00a5 ; Handy Data Structures & Post-processing El Apis for Layout Det a LAR ror tye eats) 4 Text Recognition | <\u2014\u2014 Default ane Customized \u00a5 ee Layout Structure Visualization & Export | <\u2014\u2014 | visualization & Storage The Japanese Document Helpful LayoutParser Digitization Pipeline Modules",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 11,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
{
"type": "NarrativeText",
- "element_id": "e55432edc0ceae5fae7bedae3bd560c6",
+ "element_id": "3c1fd89a3436d3cedb4d22d297c76437",
"text": "Fig. 5: Illustration of how LayoutParser helps with the historical document digi- tization pipeline.",
"metadata": {
"filetype": "application/pdf",
@@ -1983,7 +2335,7 @@
},
{
"type": "ListItem",
- "element_id": "17e31eed33c4abb58af55aa5c6e0b7b0",
+ "element_id": "c5b22d5a9f8b657ad4acdf6ad1f0bdd0",
"text": "15 A document page consists of eight rows like this. For simplicity we skip the row segmentation discussion and refer readers to the source code when available.",
"metadata": {
"filetype": "application/pdf",
@@ -2620,9 +2972,31 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "8605ad66ac8429ae6e92841d5026f0de",
- "text": "Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. arXiv preprint arXiv:1803.07640 (2018) Lukasz Garncarek, Powalski, R., Stanistawek, T., Topolski, B., Halama, P., Graliriski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020)",
+ "type": "ListItem",
+ "element_id": "a742cc226eba47ed37993cde0d2718d9",
+ "text": "[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. arXiv preprint arXiv:1803.07640 (2018)",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 15,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "0176491ee2584bffdfb943caa8aefab4",
+ "text": "Lukasz Garncarek, Powalski, R., Stanistawek, T., Topolski, B., Halama, P., Graliriski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020)",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2643,7 +3017,7 @@
},
{
"type": "ListItem",
- "element_id": "39972987462975e72ff97f3cc3d28223",
+ "element_id": "95bc71fb3542f420dfa50e22eb8c734f",
"text": "[10] Graves, A., Fern\u00b4andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classi\ufb01cation: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369\u2013376 (2006)",
"metadata": {
"filetype": "application/pdf",
@@ -2664,9 +3038,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "559ea792f7c0c98e4af9e3436774efa9",
- "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classi\ufb01cation and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991\u2013995. IEEE (2015) [12] He, K., Gkioxari, G., Doll\u00b4ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the",
+ "type": "ListItem",
+ "element_id": "3fab75481d8e6d389ea6034e18f54e00",
+ "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classi\ufb01cation and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991\u2013995. IEEE (2015)",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2687,8 +3061,8 @@
},
{
"type": "ListItem",
- "element_id": "a25accb47954c56b35a06609449901ef",
- "text": "IEEE international conference on computer vision. pp. 2961\u20132969 (2017)",
+ "element_id": "8cd8821b71e4bda1a77f6a114ff54f50",
+ "text": "[12] He, K., Gkioxari, G., Doll\u00b4ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision. pp. 2961\u20132969 (2017)",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2709,7 +3083,7 @@
},
{
"type": "ListItem",
- "element_id": "616320116770187bb631e2bcabdc44fe",
+ "element_id": "02c0a0c6c60503798f3894fe244c237d",
"text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770\u2013778 (2016)",
"metadata": {
"filetype": "application/pdf",
@@ -2731,7 +3105,7 @@
},
{
"type": "ListItem",
- "element_id": "8ead02f7720d59492ca67a5cfddd4552",
+ "element_id": "893390a7c02886a034be490296237e30",
"text": "[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J. 2007(159), 2 (Jul 2007)",
"metadata": {
"filetype": "application/pdf",
@@ -2753,7 +3127,7 @@
},
{
"type": "ListItem",
- "element_id": "ccf2aef698df297baac645bfbe87b5a4",
+ "element_id": "bd2e9f3795d8492cadde716193f62aba",
"text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42\u201347. IEEE (2011)",
"metadata": {
"filetype": "application/pdf",
@@ -2775,7 +3149,7 @@
},
{
"type": "ListItem",
- "element_id": "7303875a4141fe55ab6c6538d2660269",
+ "element_id": "07cef8a161dd1c3f0895c605844d678e",
"text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120\u2013122. UIST \u201920 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143",
"metadata": {
"filetype": "application/pdf",
@@ -2797,7 +3171,7 @@
},
{
"type": "ListItem",
- "element_id": "484bdc79ca505343715e3d177bd17275",
+ "element_id": "90ad04faa055039bfd37c1a851878048",
"text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055\u20133062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767",
"metadata": {
"filetype": "application/pdf",
@@ -2819,7 +3193,7 @@
},
{
"type": "ListItem",
- "element_id": "d3a921d79a30615dcf174c93d2da8d4d",
+ "element_id": "dfcf2fc9f58128e98ba312b0c89fbea1",
"text": "[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019)",
"metadata": {
"filetype": "application/pdf",
@@ -2841,7 +3215,7 @@
},
{
"type": "ListItem",
- "element_id": "8eea8c964496b9e3de3099a9af798467",
+ "element_id": "b5e16aae3d43919bb5899fade72c0550",
"text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00b4ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740\u2013755. Springer (2014)",
"metadata": {
"filetype": "application/pdf",
@@ -2863,7 +3237,7 @@
},
{
"type": "ListItem",
- "element_id": "de8aee29b21c13139f4875a90a52d0a0",
+ "element_id": "8344e54a6acb25643c83b5ea96c5c593",
"text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431\u20133440 (2015)",
"metadata": {
"filetype": "application/pdf",
@@ -2885,7 +3259,7 @@
},
{
"type": "ListItem",
- "element_id": "bce47bb8dec257c966d948be79e80094",
+ "element_id": "9476b030857c32e55a638928df6d01e8",
"text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Su\ufb01, S., Williams, A., Wolsten- croft, K.: An experimental work\ufb02ow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161\u2013168 (2011)",
"metadata": {
"filetype": "application/pdf",
@@ -2907,7 +3281,7 @@
},
{
"type": "ListItem",
- "element_id": "7a372cbcf79efc9cc23d35644816ca15",
+ "element_id": "4640c3f33351b994165071b6d872ef56",
"text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7\u201312. IEEE (2018)",
"metadata": {
"filetype": "application/pdf",
@@ -2950,9 +3324,53 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "e5e88c91dcc8703ef7ffaf69fe565020",
- "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic di\ufb00erentiation in pytorch (2017) [24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019) [25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257\u2013260. IEEE (2010)",
+ "type": "ListItem",
+ "element_id": "048415c6e5fc7bdd5466bf9c877b4a14",
+ "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic di\ufb00erentiation in pytorch (2017)",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 16,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "04c0655f0749575bbe838891bf103d6d",
+ "text": "[24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019)",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 16,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "a2f34eceb4f6036f105c6319de5450d1",
+ "text": "[25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257\u2013260. IEEE (2010)",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2973,7 +3391,7 @@
},
{
"type": "ListItem",
- "element_id": "a647b5ee9dfd11735b912b0510f476a1",
+ "element_id": "c81432ac5c76b82c1ccd93d0a3ee15b1",
"text": "[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 572\u2013573 (2020)",
"metadata": {
"filetype": "application/pdf",
@@ -2995,7 +3413,7 @@
},
{
"type": "ListItem",
- "element_id": "70a42a501297733d90dbcae55dbc2b78",
+ "element_id": "0f5cebf6a7661981062a59f24e0b2a3a",
"text": "[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142\u2013147. IEEE (2019)",
"metadata": {
"filetype": "application/pdf",
@@ -3017,7 +3435,7 @@
},
{
"type": "ListItem",
- "element_id": "3d9af66828b6b1e385e04dcad340e403",
+ "element_id": "d02327f415141694d5853b57ac0f9e3f",
"text": "[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91\u201399 (2015)",
"metadata": {
"filetype": "application/pdf",
@@ -3038,9 +3456,31 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "ff7c339e3258376076b2f515c6b0f01e",
- "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE transactions on neural networks 20(1), 61\u201380 (2008) [30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162\u20131167. IEEE (2017)",
+ "type": "ListItem",
+ "element_id": "d0529ef231eeac2e8ae2083dee416210",
+ "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE transactions on neural networks 20(1), 61\u201380 (2008)",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 16,
+ "data_source": {
+ "record_locator": {
+ "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
+ },
+ "permissions_data": [
+ {
+ "mode": 33188
+ }
+ ]
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "98fce7a2720ed7eda87a02659071b121",
+ "text": "[30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162\u20131167. IEEE (2017)",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -3061,7 +3501,7 @@
},
{
"type": "ListItem",
- "element_id": "410d64198e29b695d48db2cd3781daae",
+ "element_id": "e3146a202c282ecab0d87f59d3307983",
"text": "[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548\u2013549 (2020)",
"metadata": {
"filetype": "application/pdf",
@@ -3083,7 +3523,7 @@
},
{
"type": "ListItem",
- "element_id": "fc8457575ed11e22f45c936aba277303",
+ "element_id": "7b2d2fbb2bcae74fae3cf85c7478eb9f",
"text": "[32] Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning based layout annotation. arXiv preprint arXiv:2010.01762 (2020)",
"metadata": {
"filetype": "application/pdf",
@@ -3105,7 +3545,7 @@
},
{
"type": "ListItem",
- "element_id": "b66f47222b34c59b619b0f90b165b093",
+ "element_id": "7937fc115bcbbc8c08640587fa5ed827",
"text": "[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720\u2013725. IEEE (2019)",
"metadata": {
"filetype": "application/pdf",
@@ -3126,9 +3566,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "93eb7c029c0a6d8353aba82f5f2d389d",
- "text": "[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface\u2019s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019) [35] Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. https://",
+ "type": "ListItem",
+ "element_id": "881f67b82dccc13eaf96e912750c0318",
+ "text": "[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface\u2019s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019)",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -3149,8 +3589,8 @@
},
{
"type": "ListItem",
- "element_id": "ba70589bb3f48ccf6e18724702cc1f10",
- "text": "github.com/facebookresearch/detectron2 (2019)",
+ "element_id": "71c1e09e0ae75ac750aaf4bfb71539d5",
+ "text": "[35] Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. https:// github.com/facebookresearch/detectron2 (2019)",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -3171,7 +3611,7 @@
},
{
"type": "ListItem",
- "element_id": "a8ce4311d30f1f7cba9043e30c9ad6d1",
+ "element_id": "f28bdd6f783474abbbbb57c24978a1ff",
"text": "[36] Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)",
"metadata": {
"filetype": "application/pdf",
@@ -3193,7 +3633,7 @@
},
{
"type": "ListItem",
- "element_id": "a9acaa0d527f89ed3f3c7daac7694a23",
+ "element_id": "8b9c717003c9c36fc9833b6226aef9a3",
"text": "[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of text and layout for document image understanding (2019)",
"metadata": {
"filetype": "application/pdf",
@@ -3215,8 +3655,8 @@
},
{
"type": "ListItem",
- "element_id": "b0e2d232fd257ee8ca691ff77b74fcee",
- "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for doc- In: 2019 International Conference on Document IEEE (Sep 2019). ument Analysis and Recognition (ICDAR). pp. 1015\u20131022. https://doi.org/10.1109/ICDAR.2019.00166 layout analysis.",
+ "element_id": "3ac304a6df305ec0a0bb9079795b6c2e",
+ "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for doc- ument layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015\u20131022. IEEE (Sep 2019). https://doi.org/10.1109/ICDAR.2019.00166",
"metadata": {
"filetype": "application/pdf",
"languages": [
diff --git a/test_unstructured_ingest/expected-structured-output/s3/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/2023-Jan-economic-outlook.pdf.json
index c170325917..0f55c5d712 100644
--- a/test_unstructured_ingest/expected-structured-output/s3/2023-Jan-economic-outlook.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/s3/2023-Jan-economic-outlook.pdf.json
@@ -26,8 +26,8 @@
},
{
"type": "Title",
- "element_id": "8466f1c7e05ce04838ff95211c4fff50",
- "text": "WORLD ECONOMIC OUTLOOK UPDATE Inflation Peaking amid Low Growth",
+ "element_id": "ba69012a8fb5d70fcf3e58b80b42b1ce",
+ "text": "WORLD",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -51,7 +51,107 @@
},
{
"type": "Title",
- "element_id": "04fca18cc5aea2fdb24b55c01f4fa968",
+ "element_id": "464e483f73e708f251dd997141ca910a",
+ "text": "ECONOMIC",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "bd1b383b1523f7a038f369fcab02f861",
+ "text": "OUTLOOK",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "1ee4c1d4ef02f1368001b9557fc4650c",
+ "text": "UPDATE",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "359641a904473c37f58769cc8a1d9c36",
+ "text": "Inflation Peaking amid Low Growth",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "a27cc1b76f2648a54e6469d4fd57202d",
"text": "2023 JAN",
"metadata": {
"filetype": "application/pdf",
@@ -76,7 +176,7 @@
},
{
"type": "Image",
- "element_id": "e8910cbfc6833d5ca117621c22a183e5",
+ "element_id": "3cbcac684b7f5dfb187b5d3d313f30a9",
"text": "",
"metadata": {
"filetype": "application/pdf",
@@ -201,8 +301,8 @@
},
{
"type": "ListItem",
- "element_id": "6d8185901672f642fb852b8b77c2f244",
- "text": "tighter monetary conditions and lower growth potentially affecting financial and debt stability, it is necessary to deploy macroprudential tools and strengthen debt restructuring frameworks. Accelerating COVID-19 vaccinations in China would safeguard the recovery, with positive cross-border spillovers. Fiscal support should be better targeted at those most affected by elevated food and energy prices, and broad-based fiscal relief measures should be withdrawn. Stronger multilateral cooperation is essential to preserve the gains from the rules-based multilateral system and to mitigate climate change by limiting emissions and raising green investment.",
+ "element_id": "dd461967e22f3ebdfc8cf78173c94018",
+ "text": "In most economies, amid the cost-of-living crisis, the priority remains achieving sustained disinflation. With tighter monetary conditions and lower growth potentially affecting financial and debt stability, it is necessary to deploy macroprudential tools and strengthen debt restructuring frameworks. Accelerating COVID-19 vaccinations in China would safeguard the recovery, with positive cross-border spillovers. Fiscal support should be better targeted at those most affected by elevated food and energy prices, and broad-based fiscal relief measures should be withdrawn. Stronger multilateral cooperation is essential to preserve the gains from the rules-based multilateral system and to mitigate climate change by limiting emissions and raising green investment.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -401,8 +501,58 @@
},
{
"type": "NarrativeText",
- "element_id": "2951e1eb514a453e5813bc6889f759f5",
- "text": "Monetary policy starts to bite. Signs are apparent that monetary policy tightening is starting to cool demand and inflation, but the full impact is unlikely to be realized before 2024. Global headline inflation appears to have peaked in the third quarter of 2022 (Figure 1). Prices of fuel and nonfuel commodities have declined, lowering headline inflation, notably in the United States, the euro area, and Latin America. But underlying (core) inflation has not yet peaked in most economies and remains well above pre-pandemic levels. It has persisted amid second-round effects from earlier cost shocks and tight labor markets with robust wage growth as consumer demand has remained resilient. Medium-term inflation expectations generally remain anchored, but some gauges are up. These developments have caused central banks to raise rates faster than expected, especially in the United States and the euro area, and to signal that rates will stay elevated for longer. Core inflation is declining in some economies that have completed their tightening cycle\u2014such as Brazil. Financial markets are displaying high sensitivity to inflation news, with equity markets rising following recent releases of lower inflation data in anticipation of interest rate cuts (Box 1), despite central banks\u2019 communicating their resolve to tighten policy further. With the peak in US headline inflation and an acceleration in rate hikes by several non-US central banks, the dollar has weakened since September but remains significantly stronger than a year ago.",
+ "element_id": "1d6876589a1534c985d4671d913a6515",
+ "text": "Monetary policy starts to bite. Signs are apparent",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "e0d8de407f3ab4149ae9ec09501ca50b",
+ "text": "that monetary policy tightening is starting to cool demand and inflation, but the full impact is unlikely to be realized before 2024. Global headline inflation appears to have peaked in the third quarter of 2022 (Figure 1). Prices of fuel and nonfuel commodities have declined, lowering headline inflation, notably in the United States, the euro area, and Latin America. But underlying (core) inflation has not yet peaked in most economies and remains well above pre-pandemic levels. It has persisted amid second-round effects from earlier cost shocks and tight labor markets with robust wage growth as consumer demand has remained resilient. Medium-term inflation expectations generally remain anchored, but some gauges are up. These developments have caused central banks to raise rates faster than expected, especially in the United States and the euro area, and to signal that rates will stay elevated for longer. Core inflation is declining in some economies that have completed their tightening cycle\u2014such as Brazil. Financial markets are displaying high sensitivity to inflation news, with equity markets rising following recent releases of lower inflation data in anticipation of interest rate cuts (Box 1), despite central banks\u2019 communicating their resolve to tighten policy further. With the peak in US headline inflation and an acceleration in",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "72129da956da90c4e678bc7f831baa2f",
+ "text": "Figure 1. Twin Peaks? Headline and Core Inflation (Percent, year over year)",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -426,8 +576,8 @@
},
{
"type": "Title",
- "element_id": "ea9e70213dbb306bbfc411301593a01f",
- "text": "Median country Brazil",
+ "element_id": "153b8351d764074d2c0f5bf2b2a528c7",
+ "text": "Median country",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -451,7 +601,7 @@
},
{
"type": "Title",
- "element_id": "a37a878930b52526b96231dcbcb9b3f4",
+ "element_id": "0127157dbb9f84b224f1eeecf25dc30c",
"text": "United States",
"metadata": {
"filetype": "application/pdf",
@@ -476,7 +626,7 @@
},
{
"type": "Title",
- "element_id": "cdd95d6fc1603d1c87d82ef501854019",
+ "element_id": "9b00d9f3f5dc1ca65b2f2b482f853609",
"text": "Euro area",
"metadata": {
"filetype": "application/pdf",
@@ -500,9 +650,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "f37bd3b81db4c6ef56f199b5d899aebd",
- "text": "Winter comes to Europe. European economic growth in 2022 was more resilient than expected in the face of the large negative terms-of-trade shock from the war in Ukraine. This resilience\u2013\u2013which is",
+ "type": "UncategorizedText",
+ "element_id": "3b7737beb76051b3562a749fdc734e87",
+ "text": "\u2014",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -525,9 +675,9 @@
}
},
{
- "type": "ListItem",
- "element_id": "23cc5e8cc806e807c2d2a5070bd07b1c",
- "text": "2 International Monetary Fund | January 2023",
+ "type": "UncategorizedText",
+ "element_id": "887933ebf94ea84ceb6d564a1ed0497a",
+ "text": "\u2014\u2014",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -550,9 +700,9 @@
}
},
{
- "type": "UncategorizedText",
- "element_id": "43f5a5eb2707a8a8f8d7fd26a78f9dca",
- "text": "Nov. \u00ab22",
+ "type": "Image",
+ "element_id": "4a16d2c1ec538127db008afd58997035",
+ "text": "Brazil 18 16 1. Headline Inflation 14 12 10 8 6 4 2 0 \u20132 Jan. 2019 Jul. 19 Jan. 20 Jul. 20 Jan. 21 Jul. 21 Jan. 22 16 14 2. Core Inflation 12 10 8 6 4 2 0 \u20132 Jan. Jul. Jan. Jul. Jan. Jul. Jan. 2019 19 20 20 21 21 22 Jul. 22 Jul. 22 Nov. 22 Nov. 22 ",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -575,9 +725,84 @@
}
},
{
- "type": "Title",
- "element_id": "010f24c0d9604698a0a97e91efcf2ae6",
- "text": "Nov.",
+ "type": "FigureCaption",
+ "element_id": "1248f309b0a6807c8a134933d85550eb",
+ "text": "Sources: Haver Analytics; and IMF staff calculations. Note: The figure shows the developments in headline and core inflation across 18 advanced economies and 17 emerging market and developing economies. Core inflation is the change in prices for goods and services, but excluding those for food and energy (or the closest available measure). For the euro area (and other European countries for which the data are available), energy, food, alcohol, and tobacco are excluded. The gray bands depict the 10th to 90th percentiles of inflation across economies.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "bbdb4ab6d6984079a115ef1e9b38c7aa",
+ "text": "rate hikes by several non-US central banks, the dollar has weakened since September but remains significantly stronger than a year ago.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "507c6366e98028004c385ed2d7f6737c",
+ "text": "Winter comes to Europe. European economic growth in 2022 was more resilient than expected in the face of the large negative terms-of-trade shock from the war in Ukraine. This resilience\u2013\u2013which is",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 3,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "809c2b1b5c4247d05c07ef923109b738",
+ "text": "2 International Monetary Fund | January 2023",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -901,8 +1126,8 @@
},
{
"type": "ListItem",
- "element_id": "af70a583660627245d8a985c424ed5ce",
- "text": "percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects the effects of faster rate hikes by the European Central Bank and eroding real incomes, offset by the carryover from the 2022 outturn, lower wholesale energy prices, and additional announcements of fiscal purchasing power support in the form of energy price controls and cash transfers.",
+ "element_id": "dc4ee21bddd5388c5390a3d6491e1d76",
+ "text": "Growth in the euro area is projected to bottom out at 0.7 percent in 2023 before rising to 1.6 percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects the effects of faster rate hikes by the European Central Bank and eroding real incomes, offset by the carryover from the 2022 outturn, lower wholesale energy prices, and additional announcements of fiscal purchasing power support in the form of energy price controls and cash transfers.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -975,15 +1200,440 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "7e8c93b4741fa4ff1a87881dcaa89686",
- "text": "For emerging market and developing economies, growth is projected to rise modestly, from 3.9 percent in 2022 to 4.0 percent in 2023 and 4.2 percent in 2024, with an upward revision of 0.3 percentage point for 2023 and a downward revision of 0.1 percentage point for 2024. About half of emerging market and developing economies have lower growth in 2023 than in 2022.",
+ "type": "NarrativeText",
+ "element_id": "7e8c93b4741fa4ff1a87881dcaa89686",
+ "text": "For emerging market and developing economies, growth is projected to rise modestly, from 3.9 percent in 2022 to 4.0 percent in 2023 and 4.2 percent in 2024, with an upward revision of 0.3 percentage point for 2023 and a downward revision of 0.1 percentage point for 2024. About half of emerging market and developing economies have lower growth in 2023 than in 2022.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "2514ae8c84a15a8ddcfda4bd5f14677a",
+ "text": "Growth in emerging and developing Asia is expected to rise in 2023 and 2024 to 5.3 percent and 5.2 percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent attributable to China\u2019s economy. China\u2019s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent\u2014the first time in more than 40 years with China\u2019s growth below the global average. Growth in China is projected to rise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "372dd00b56271064cbdb336f0c1edf09",
+ "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at \u20132.2 percent compared with a predicted \u20133.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "f272e9be7b941dd959170f15e9aeda36",
+ "text": "In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "a215de8ed995008ce337de99d5cd92af",
+ "text": "4 International Monetary Fund | January 2023",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "90d34f5e846e7bc2bf02fac74e7081a7",
+ "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "ffb99a5d75b910c8329e0ef1bafde120",
+ "text": "major trading partner economies, and in Brazil, greater-than-expected fiscal support. Growth in the region is projected to rise to 2.1 percent in 2024, although with a downward revision of 0.3 percentage point, reflecting tighter financial conditions, lower prices of exported commodities, and downward revisions to trading partner growth.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "995efce4c4b1e974e61f6e4205687a44",
+ "text": "Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "98fe012aca786c787983b2ecfb06dc7d",
+ "text": "In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria\u2019s rising growth in 2023 due to measures to address insecurity issues in the oil sector. In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "2b171bf6d92042917f888e7977663b26",
+ "text": "Inflation Peaking",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "a8698c237c1e670a0a9076a1b967dfd3",
+ "text": "About 84 percent of countries are expected to have lower headline (consumer price index) inflation in 2023 than in 2022. Global inflation is set to fall from 8.8 percent in 2022 (annual average) to 6.6 percent in 2023 and 4.3 percent in 2024\u2013\u2013above pre-pandemic (2017\u201319) levels of about 3.5 percent. The projected disinflation partly reflects declining international fuel and nonfuel commodity prices due to weaker global demand. It also reflects the cooling effects of monetary policy tightening on underlying (core) inflation, which globally is expected to decline from 6.9 percent in the fourth quarter of 2022 (year over year) to 4.5 percent by the fourth quarter of 2023. Still, disinflation will take time: by 2024, projected annual average headline and core inflation will, respectively, still be above pre-pandemic levels in 82 percent and 86 percent of economies.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "decfc37fc750386b02317a784566d058",
+ "text": "In advanced economies, annual average inflation is projected to decline from 7.3 percent in 2022 to 4.6 percent in 2023 and 2.6 percent in 2024\u2013\u2013above target in several cases. In emerging market and developing economies, projected annual inflation declines from 9.9 percent in 2022 to 8.1 percent in 2023 and 5.5 percent in 2024, above the 4.9 percent pre-pandemic (2017\u201319) average. In low-income developing countries, inflation is projected to moderate from 14.2 percent in 2022 to 8.6 percent in 2024\u2013\u2013still high, but close to the pre-pandemic average.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "0bbd5bc12c1f007da0810f63e0759a48",
+ "text": "The balance of risks to the global outlook remains tilted to the downside, with scope for lower growth and higher inflation, but adverse risks have moderated since the October 2022 World Economic Outlook.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "8851acc6825396c414e4f510fbd1a74e",
+ "text": "International Monetary Fund | January 2023. 5",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "5ed317e88b9e032f0a07e1a59492920c",
+ "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 7,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "7b55f9cf2ab15edd5f5ddda66cd012d1",
+ "text": "Table 1. Overview of the World Economic Outlook Projections (Percent change, unless noted otherwise)",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 7,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "Table",
+ "element_id": "9a04939a11dc75f1f95170cfc4e9dbc5",
+ "text": "Year over Year Difference from October 2022 Q4 over Q4 2/ Estimate Projections WEO Projections 1/ Estimate Projections 2021 2022 2023 2024 2023 2024 2022 2023 2024 6.2 3.4 2.9 3.1 0.2 \u20130.1 1.9 3.2 3.0 Advanced Economies 5.4 2.7 1.2 1.4 0.1 \u20130.2 1.3 1.1 1.6 United States 5.9 2.0 1.4 1.0 0.4 \u20130.2 0.7 1.0 1.3 Euro Area 5.3 3.5 0.7 1.6 0.2 \u20130.2 1.9 0.5 2.1 Germany 2.6 1.9 0.1 1.4 0.4 \u20130.1 1.4 0.0 2.3 France 6.8 2.6 0.7 1.6 0.0 0.0 0.5 0.9 1.8 Italy 6.7 3.9 0.6 0.9 0.8 \u20130.4 2.1 0.1 1.0 Spain 5.5 5.2 1.1 2.4 \u20130.1 \u20130.2 2.1 1.3 2.8 Japan 2.1 1.4 1.8 0.9 0.2 \u20130.4 1.7 1.0 1.0 United Kingdom 7.6 4.1 \u20130.6 0.9 \u20130.9 0.3 0.4 \u20130.5 1.8 Canada 5.0 3.5 1.5 1.5 0.0 \u20130.1 2.3 1.2 1.9 Other Advanced Economies 3/ 5.3 2.8 2.0 2.4 \u20130.3 \u20130.2 1.4 2.1 2.2 Emerging Market and Developing Economies 6.7 3.9 4.0 4.2 0.3 \u20130.1 2.5 5.0 4.1 Emerging and Developing Asia 7.4 4.3 5.3 5.2 0.4 0.0 3.4 6.2 4.9 China 8.4 3.0 5.2 4.5 0.8 0.0 2.9 5.9 4.1 India 4/ 8.7 6.8 6.1 6.8 0.0 0.0 4.3 7.0 7.1 Emerging and Developing Europe 6.9 0.7 1.5 2.6 0.9 0.1 \u20132.0 3.5 2.8 Russia 4.7 \u20132.2 0.3 2.1 2.6 0.6 \u20134.1 1.0 2.0 Latin America and the Caribbean 7.0 3.9 1.8 2.1 0.1 \u20130.3 2.6 1.9 1.9 Brazil 5.0 3.1 1.2 1.5 0.2 \u20130.4 2.8 0.8 2.2 Mexico 4.7 3.1 1.7 1.6 0.5 \u20130.2 3.7 1.1 1.9 Middle East and Central Asia 4.5 5.3 3.2 3.7 \u20130.4 0.2 . . . . . . . . . Saudi Arabia 3.2 8.7 2.6 3.4 \u20131.1 0.5 4.6 2.7 3.5 Sub-Saharan Africa 4.7 3.8 3.8 4.1 0.1 0.0 . . . . . . . . . Nigeria 3.6 3.0 3.2 2.9 0.2 0.0 2.6 3.1 2.9 South Africa 4.9 2.6 1.2 1.3 0.1 0.0 3.0 0.5 1.8 Memorandum World Growth Based on Market Exchange Rates 6.0 3.1 2.4 2.5 0.3 \u20130.1 1.7 2.5 2.5 European Union 5.5 3.7 0.7 1.8 0.0 \u20130.3 1.8 1.2 2.0 ASEAN-5 5/ 3.8 5.2 4.3 4.7 \u20130.2 \u20130.2 3.7 5.7 4.0 Middle East and North Africa 4.1 5.4 3.2 3.5 \u20130.4 0.2 . . . . . . . . . 
Emerging Market and Middle-Income Economies 7.0 3.8 4.0 4.1 0.4 0.0 2.5 5.0 4.1 Low-Income Developing Countries 4.1 4.9 4.9 5.6 0.0 0.1 . . . . . . . . . 10.4 5.4 2.4 3.4 \u20130.1 \u20130.3 . . . . . . . . . 9.4 6.6 2.3 2.7 0.0 \u20130.4 . . . . . . . . . 12.1 3.4 2.6 4.6 \u20130.3 0.0 . . . . . . . . . 65.8 39.8 \u201316.2 \u20137.1 \u20133.3 \u20130.9 11.2 \u20139.8 \u20135.9 26.4 7.0 \u20136.3 \u20130.4 \u20130.1 0.3 \u20132.0 1.4 \u20130.2 4.7 8.8 6.6 4.3 0.1 0.2 9.2 5.0 3.5 3.1 7.3 4.6 2.6 0.2 0.2 7.8 3.1 2.3",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 7,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "2d46f6faafe2544549dd92fcbe07addd",
+ "text": "World Output",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 5,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
"version": "c7eed4fc056b089a98f6a3ad9ec9373e",
@@ -1000,15 +1650,15 @@
}
},
{
- "type": "ListItem",
- "element_id": "c1e048bf64acdbfb4b3f07ae2c2bd9a0",
- "text": "percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent attributable to China\u2019s economy. China\u2019s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent\u2014the first time in more than 40 years with China\u2019s growth below the global average. Growth in China is projected to rise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024.",
+ "type": "Title",
+ "element_id": "cde79e6244a9923c5712c3adc31c2f4d",
+ "text": "World Trade Volume (goods and services) 6/",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 5,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
"version": "c7eed4fc056b089a98f6a3ad9ec9373e",
@@ -1025,15 +1675,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "9da9aade2e55fc72f03d8b5a78092503",
- "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at \u20132.2 percent compared with a predicted \u20133.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in",
+ "type": "Title",
+ "element_id": "b822f693f3cd08255c54b66e334fd26d",
+ "text": "Advanced Economies",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 5,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
"version": "c7eed4fc056b089a98f6a3ad9ec9373e",
@@ -1050,15 +1700,15 @@
}
},
{
- "type": "ListItem",
- "element_id": "e2f761e5fbfa887c5c4654959178dd0e",
- "text": "4 International Monetary Fund | January 2023",
+ "type": "Title",
+ "element_id": "60a0feaace246d39e4f86e9cfc6e96db",
+ "text": "Emerging Market and Developing Economies",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 5,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
"version": "c7eed4fc056b089a98f6a3ad9ec9373e",
@@ -1076,14 +1726,14 @@
},
{
"type": "Title",
- "element_id": "90d34f5e846e7bc2bf02fac74e7081a7",
- "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023",
+ "element_id": "c44ae79200013da750a6ea80d8a8fe11",
+ "text": "Commodity Prices",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
"version": "c7eed4fc056b089a98f6a3ad9ec9373e",
@@ -1100,15 +1750,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "ffb99a5d75b910c8329e0ef1bafde120",
- "text": "major trading partner economies, and in Brazil, greater-than-expected fiscal support. Growth in the region is projected to rise to 2.1 percent in 2024, although with a downward revision of 0.3 percentage point, reflecting tighter financial conditions, lower prices of exported commodities, and downward revisions to trading partner growth.",
+ "type": "Title",
+ "element_id": "8cbc99504fd7235d91f3873e0ed62d93",
+ "text": "Oil 7/",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
"version": "c7eed4fc056b089a98f6a3ad9ec9373e",
@@ -1126,14 +1776,14 @@
},
{
"type": "NarrativeText",
- "element_id": "d75546413a70b32318e60a4439de8e7f",
- "text": "Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria\u2019s rising growth in 2023 due to measures to address insecurity issues in the oil sector. In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints.",
+ "element_id": "0e84a245d7d13d46592851b1a27cb5f7",
+ "text": "Nonfuel (average based on world commodity import weights)",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
"version": "c7eed4fc056b089a98f6a3ad9ec9373e",
@@ -1151,14 +1801,14 @@
},
{
"type": "Title",
- "element_id": "c4ed38259052e804c28ab9511fb83709",
- "text": "Inflation Peaking",
+ "element_id": "d8565931efa0e4c16aef187a34276810",
+ "text": "World Consumer Prices 8/",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
"version": "c7eed4fc056b089a98f6a3ad9ec9373e",
@@ -1175,15 +1825,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "6fad039abd2a07c18e04427a5c0934c1",
- "text": "About 84 percent of countries are expected to have lower headline (consumer price index) inflation in 2023 than in 2022. Global inflation is set to fall from 8.8 percent in 2022 (annual average) to 6.6 percent in 2023 and 4.3 percent in 2024\u2013\u2013above pre-pandemic (2017\u201319) levels of about 3.5 percent. The projected disinflation partly reflects declining international fuel and nonfuel commodity prices due to weaker global demand. It also reflects the cooling effects of monetary policy tightening on underlying (core) inflation, which globally is expected to decline from 6.9 percent in the fourth quarter of 2022 (year over year) to 4.5 percent by the fourth quarter of 2023. Still, disinflation will take time: by 2024, projected annual average headline and core inflation will, respectively, still be above pre-pandemic levels in 82 percent and 86 percent of economies.",
+ "type": "Title",
+ "element_id": "ae516f771563a4a0c897f8c86f0c7eef",
+ "text": "Advanced Economies 9/",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
"version": "c7eed4fc056b089a98f6a3ad9ec9373e",
@@ -1200,15 +1850,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "276ca71db7194279383dbd5ed47e6401",
- "text": "In advanced economies, annual average inflation is projected to decline from 7.3 percent in 2022 to 4.6 percent in 2023 and 2.6 percent in 2024\u2013\u2013above target in several cases. In emerging market and developing economies, projected annual inflation declines from 9.9 percent in 2022 to 8.1 percent in 2023 and 5.5 percent in 2024, above the 4.9 percent pre-pandemic (2017\u201319) average. In low-income developing countries, inflation is projected to moderate from 14.2 percent in 2022 to 8.6 percent in 2024\u2013\u2013still high, but close to the pre-pandemic average.",
+ "type": "Title",
+ "element_id": "a46bed6bfca63e9f5907181bd6105c6c",
+ "text": "Emerging Market and Developing Economies 8/",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
"version": "c7eed4fc056b089a98f6a3ad9ec9373e",
@@ -1225,15 +1875,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "dad6fdc16c791d4a298b86a74a7787cb",
- "text": "The balance of risks to the global outlook remains tilted to the downside, with scope for lower growth and higher inflation, but adverse risks have moderated since the October 2022 World Economic Outlook.",
+ "type": "UncategorizedText",
+ "element_id": "089adf893ed0036f58de8e9706275deb",
+ "text": "5.9",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
"version": "c7eed4fc056b089a98f6a3ad9ec9373e",
@@ -1250,15 +1900,15 @@
}
},
{
- "type": "ListItem",
- "element_id": "4a9791d1daa8c2de8ae6fe9473b0806c",
- "text": "International Monetary Fund | January 2023. 5",
+ "type": "UncategorizedText",
+ "element_id": "bf7bc8145db012e18cec57be028bfc8d",
+ "text": "9.9",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
"version": "c7eed4fc056b089a98f6a3ad9ec9373e",
@@ -1275,9 +1925,9 @@
}
},
{
- "type": "Title",
- "element_id": "5ed317e88b9e032f0a07e1a59492920c",
- "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023",
+ "type": "UncategorizedText",
+ "element_id": "ec4456307c6149d30b284be6357f05a2",
+ "text": "8.1",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1300,9 +1950,9 @@
}
},
{
- "type": "Title",
- "element_id": "7b55f9cf2ab15edd5f5ddda66cd012d1",
- "text": "Table 1. Overview of the World Economic Outlook Projections (Percent change, unless noted otherwise)",
+ "type": "UncategorizedText",
+ "element_id": "fcbe2b933557443cc5f9950f8f9dac9b",
+ "text": "5.5",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1325,9 +1975,9 @@
}
},
{
- "type": "Table",
- "element_id": "4eb34e94205ace0b2308f955a58a3f0a",
- "text": "Year over Year Difference from October 2022 Q4 over Q4 2/ 2021 Estimate 2022 Projections 2023 2024 WEO Projections 1/ 2023 2024 Estimate 2022 Projections 2023 2024 6.2 3.4 2.9 3.1 0.2 \u20130.1 1.9 3.2 3.0 Advanced Economies United States Euro Area Germany France Italy Spain Japan United Kingdom Canada Other Advanced Economies 3/ 5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3 2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8 1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 \u20130.6 1.5 2.0 1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4 0.1 0.4 0.2 0.4 0.0 0.8 \u20130.1 0.2 \u20130.9 0.0 \u20130.3 \u20130.2 \u20130.2 \u20130.2 \u20130.1 0.0 \u20130.4 \u20130.2 \u20130.4 0.3 \u20130.1 \u20130.2 1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4 1.1 1.0 0.5 0.0 0.9 0.1 1.3 1.0 \u20130.5 1.2 2.1 1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2 Emerging Market and Developing Economies Emerging and Developing Asia China India 4/ Emerging and Developing Europe Russia Latin America and the Caribbean Brazil Mexico Middle East and Central Asia Saudi Arabia Sub-Saharan Africa Nigeria South Africa 6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9 3.9 4.3 3.0 6.8 0.7 \u20132.2 3.9 3.1 3.1 5.3 8.7 3.8 3.0 2.6 4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2 4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3 0.3 0.4 0.8 0.0 0.9 2.6 0.1 0.2 0.5 \u20130.4 \u20131.1 0.1 0.2 0.1 \u20130.1 0.0 0.0 0.0 0.1 0.6 \u20130.3 \u20130.4 \u20130.2 0.2 0.5 0.0 0.0 0.0 2.5 3.4 2.9 4.3 \u20132.0 \u20134.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0 5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 3.1 0.5 4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 
2.9 1.8 Memorandum World Growth Based on Market Exchange Rates European Union ASEAN-5 5/ Middle East and North Africa Emerging Market and Middle-Income Economies Low-Income Developing Countries 6.0 5.5 3.8 4.1 7.0 4.1 3.1 3.7 5.2 5.4 3.8 4.9 2.4 0.7 4.3 3.2 4.0 4.9 2.5 1.8 4.7 3.5 4.1 5.6 0.3 0.0 \u20130.2 \u20130.4 0.4 0.0 \u20130.1 \u20130.3 \u20130.2 0.2 0.0 0.1 1.7 1.8 3.7 . . . 2.5 . . . 2.5 1.2 5.7 . . . 5.0 . . . 2.5 2.0 4.0 . . . 4.1 . . . 10.4 9.4 12.1 5.4 6.6 3.4 2.4 2.3 2.6 3.4 2.7 4.6 \u20130.1 0.0 \u20130.3 \u20130.3 \u20130.4 0.0 . . . . . . . . . . . . . . . . . . . . . . . . . . . 65.8 26.4 39.8 7.0 \u201316.2 \u20136.3 \u20137.1 \u20130.4 \u20133.3 \u20130.1 \u20130.9 0.3 11.2 \u20132.0 \u20139.8 1.4 \u20135.9 \u20130.2",
+ "type": "UncategorizedText",
+ "element_id": "7c8543343d260ef01680bd865d5421c7",
+ "text": "0.0",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1350,9 +2000,9 @@
}
},
{
- "type": "Title",
- "element_id": "2d46f6faafe2544549dd92fcbe07addd",
- "text": "World Output",
+ "type": "UncategorizedText",
+ "element_id": "cdc1c70627c920de18637df922f17ae8",
+ "text": "0.2",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1376,8 +2026,8 @@
},
{
"type": "UncategorizedText",
- "element_id": "8c0af04bc7a87b5013697ac78410d6e3",
- "text": "World Trade Volume (goods and services) 6/ Advanced Economies Emerging Market and Developing Economies",
+ "element_id": "55363b264a5a2c416380c14ade9df65a",
+ "text": "10.4",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1400,9 +2050,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "61cc155370f47a3bdda30c407ce2958b",
- "text": "Commodity Prices Oil 7/ Nonfuel (average based on world commodity import weights)",
+ "type": "UncategorizedText",
+ "element_id": "9fdc4175cb32892f0cdd1bbf5de1e402",
+ "text": "6.6",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1425,9 +2075,9 @@
}
},
{
- "type": "UncategorizedText",
- "element_id": "7b87f70df5eb0c5c5f4e05cb89393628",
- "text": "World Consumer Prices 8/ Advanced Economies 9/ Emerging Market and Developing Economies 8/",
+ "type": "NarrativeText",
+ "element_id": "14a60bb89dd75e34c4e894ca12298218",
+ "text": "Note: Real effective exchange rates are assumed to remain constant at the levels prevailing during October 26, 2022--November 23, 2022. Economies are listed on the basis of economic size. The aggregated quarterly data are seasonally adjusted. WEO = World Economic Outlook.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1450,9 +2100,9 @@
}
},
{
- "type": "UncategorizedText",
- "element_id": "01f6110f227ca362f21307a252d387bc",
- "text": "4.7 3.1 5.9",
+ "type": "NarrativeText",
+ "element_id": "205d2c6b91ba4b8ad601aa39e6c51282",
+ "text": "1/ Difference based on rounded figures for the current and October 2022 WEO forecasts. Countries whose forecasts have been updated relative to October 2022 WEO forecasts account for approximately 90 percent of world GDP measured at purchasing-power-parity weights.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1475,9 +2125,9 @@
}
},
{
- "type": "UncategorizedText",
- "element_id": "54dc7fe009c437d116108cec181e3792",
- "text": "8.8 7.3 9.9",
+ "type": "NarrativeText",
+ "element_id": "db1914e8ceaa83aeb722e8aaf7e7c852",
+ "text": "2/ For World Output (Emerging Market and Developing Economies), the quarterly estimates and projections account for approximately 90 percent (80 percent) of annual world (emerging market and developing economies') output at purchasing-power-parity weights.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1500,9 +2150,9 @@
}
},
{
- "type": "UncategorizedText",
- "element_id": "348fa41c29526b8d4933fa0492af810e",
- "text": "6.6 4.6 8.1",
+ "type": "NarrativeText",
+ "element_id": "dab3760c894e16c85be30799ec5754b9",
+ "text": "3/ Excludes the Group of Seven (Canada, France, Germany, Italy, Japan, United Kingdom, United States) and euro area countries.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1525,9 +2175,9 @@
}
},
{
- "type": "UncategorizedText",
- "element_id": "6aabe10eb8cae2bc4de874c542565ec1",
- "text": "4.3 2.6 5.5",
+ "type": "NarrativeText",
+ "element_id": "bb37c2b3e527a7286e6eba57f0fced2d",
+ "text": "4/ For India, data and projections are presented on a fiscal year basis, with FY 2022/23 (starting in April 2022) shown in the 2022 column. India's growth projections are 5.4 percent in 2023 and 6.8 percent in 2024 based on calendar year.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1550,9 +2200,9 @@
}
},
{
- "type": "UncategorizedText",
- "element_id": "41242d1e0d075b14e1153a9a6eac1abc",
- "text": "0.1 0.2 0.0",
+ "type": "Title",
+ "element_id": "06574cb65c0c27e2c82512f214bea9de",
+ "text": "5/ Indonesia, Malaysia, Philippines, Singapore, Thailand.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1576,8 +2226,8 @@
},
{
"type": "UncategorizedText",
- "element_id": "553fd3a7a662ec2190665ed75ae70c65",
- "text": "0.2 0.2 0.2",
+ "element_id": "0ff16b590c69a739da83943b6e17f6dc",
+ "text": "6/ Simple average of growth rates for export and import volumes (goods and services).",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1600,9 +2250,9 @@
}
},
{
- "type": "UncategorizedText",
- "element_id": "42d777ee314b8f164aabb1976e185638",
- "text": "9.2 7.8 10.4",
+ "type": "NarrativeText",
+ "element_id": "9ff51d771010925e3f6c8e6f0a040375",
+ "text": "7/ Simple average of prices of UK Brent, Dubai Fateh, and West Texas Intermediate crude oil. The average assumed price of oil in US dollars a barrel, based on futures markets (as of November 29, 2022), is $81.13 in 2023 and $75.36 in 2024.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1625,9 +2275,9 @@
}
},
{
- "type": "UncategorizedText",
- "element_id": "b536aeb880e2566ac393af151f8a53c0",
- "text": "5.0 3.1 6.6",
+ "type": "Title",
+ "element_id": "a0014ae86555bf77f85d23cec4682505",
+ "text": "8/ Excludes Venezuela.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1651,8 +2301,8 @@
},
{
"type": "NarrativeText",
- "element_id": "2ccb29d3db680f09050e16da013cfa4c",
- "text": "Note: Real effective exchange rates are assumed to remain constant at the levels prevailing during October 26, 2022--November 23, 2022. Economies are listed on the basis of economic size. The aggregated quarterly data are seasonally adjusted. WEO = World Economic Outlook. 1/ Difference based on rounded figures for the current and October 2022 WEO forecasts. Countries whose forecasts have been updated relative to October 2022 WEO forecasts account for approximately 90 percent of world GDP measured at purchasing-power-parity weights. 2/ For World Output (Emerging Market and Developing Economies), the quarterly estimates and projections account for approximately 90 percent (80 percent) of annual world (emerging market and developing economies') output at purchasing-power-parity weights. 3/ Excludes the Group of Seven (Canada, France, Germany, Italy, Japan, United Kingdom, United States) and euro area countries. 4/ For India, data and projections are presented on a fiscal year basis, with FY 2022/23 (starting in April 2022) shown in the 2022 column. India's growth projections are 5.4 percent in 2023 and 6.8 percent in 2024 based on calendar year. 5/ Indonesia, Malaysia, Philippines, Singapore, Thailand. 6/ Simple average of growth rates for export and import volumes (goods and services). 7/ Simple average of prices of UK Brent, Dubai Fateh, and West Texas Intermediate crude oil. The average assumed price of oil in US dollars a barrel, based on futures markets (as of November 29, 2022), is $81.13 in 2023 and $75.36 in 2024. 8/ Excludes Venezuela. 9/ The inflation rate for the euro area is 5.7% in 2023 and 3.3% in 2024, that for Japan is 2.8% in 2023 and 2.0% in 2024, and that for the United States is 4.0% in 2023 and 2.2% in 2024.",
+ "element_id": "3307764891fd4a62cc0400e84d4f0165",
+ "text": "9/ The inflation rate for the euro area is 5.7% in 2023 and 3.3% in 2024, that for Japan is 2.8% in 2023 and 2.0% in 2024, and that for the United States is 4.0% in 2023 and 2.2% in 2024.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1676,7 +2326,7 @@
},
{
"type": "NarrativeText",
- "element_id": "fcdbec90e78c273dd191e6938b63b3aa",
+ "element_id": "4cf883e80aeb2f99200910188f532ee7",
"text": "Upside risks\u2014Plausible upside risks include more favorable surprises to domestic spending\u2014as in the third quarter of 2022\u2014which, however, would increase inflation further. At the same time, there is room for an upside scenario with lower-than-expected inflation and less monetary tightening:",
"metadata": {
"filetype": "application/pdf",
@@ -1701,8 +2351,8 @@
},
{
"type": "ListItem",
- "element_id": "05b94a59813751cc052b233294eea3bf",
- "text": "support and, in many cases, still-tight labor markets and solid wage growth, pent-up demand remains an upside risk to the growth outlook. In some advanced economies, recent data show that households are still on net adding to their stock of excess savings (as in some euro area countries and the United Kingdom) or have ample savings left (as in the United States). This leaves scope for a further boost to consumption\u2014particularly of services, including tourism.",
+ "element_id": "a226b17757fe212f8d04ee9fad9fc92b",
+ "text": "Pent-up demand boost: Fueled by the stock of excess private savings from the pandemic fiscal support and, in many cases, still-tight labor markets and solid wage growth, pent-up demand remains an upside risk to the growth outlook. In some advanced economies, recent data show that households are still on net adding to their stock of excess savings (as in some euro area countries and the United Kingdom) or have ample savings left (as in the United States). This leaves scope for a further boost to consumption\u2014particularly of services, including tourism.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1726,7 +2376,7 @@
},
{
"type": "ListItem",
- "element_id": "a92c9c37dd45ab95af9ee011a431bbfd",
+ "element_id": "29e7108e3c30105f858cdcad9c4b4189",
"text": "6 International Monetary Fund | January 2023",
"metadata": {
"filetype": "application/pdf",
@@ -1751,8 +2401,8 @@
},
{
"type": "UncategorizedText",
- "element_id": "36b409ff8f7f08da8322fb6945b054ef",
- "text": "3.5 2.3 4.5",
+ "element_id": "6c50e486db99ce57c0a2eb0558d6586d",
+ "text": "4.5",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1876,8 +2526,8 @@
},
{
"type": "ListItem",
- "element_id": "9d5064113562605b7a1b3f1bc24840a4",
- "text": "capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems.",
+ "element_id": "0a4f402b21ca4eb05c8b2d1d9940d49c",
+ "text": "China\u2019s recovery stalling: Amid still-low population immunity levels and insufficient hospital capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1901,8 +2551,8 @@
},
{
"type": "ListItem",
- "element_id": "a020d627fc081a56453ab6b3ee8c0881",
- "text": "vulnerability, particularly for Europe and lower-income countries. Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China\u2019s energy demand picks up, causing price spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase.",
+ "element_id": "f258a4e73392b053b7cd4cfd1f455c7e",
+ "text": "War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of vulnerability, particularly for Europe and lower-income countries. Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China\u2019s energy demand picks up, causing price spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1925,9 +2575,34 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "5a4931739cd615032ea03860c3ace150",
- "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China\u2019s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy.",
+ "type": "ListItem",
+ "element_id": "d483fc1e7a6b11dc7c029afef53f19c2",
+ "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 8,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "ListItem",
+ "element_id": "b1d0038f42c249af7b65ac64d90ed36c",
+ "text": "Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China\u2019s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1951,7 +2626,7 @@
},
{
"type": "ListItem",
- "element_id": "5c53b2d3f514a92cc6c099fd9a46b49a",
+ "element_id": "a2a6d319d6d69c9f012f209332552ba6",
"text": "Sudden financial market repricing: A premature easing in financial conditions in response to lower headline inflation data could complicate anti-inflation policies and necessitate additional monetary tightening. For the same reason, unfavorable inflation data releases could trigger sudden repricing of assets and increase volatility in financial markets. Such movements could strain liquidity and the functioning of critical markets, with ripple effects on the real economy.",
"metadata": {
"filetype": "application/pdf",
@@ -1975,9 +2650,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "e3c931f10a12b33fb9aeb6427a16c7ae",
- "text": "Geopolitical fragmentation: The war in Ukraine and the related international sanctions aimed at pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing",
+ "type": "ListItem",
+ "element_id": "ecc3a5976f8b96e1a909f9a8aa8e8441",
+ "text": "Geopolitical fragmentation: The war in Ukraine and the related international sanctions aimed at",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2001,8 +2676,8 @@
},
{
"type": "ListItem",
- "element_id": "84d70cebf2068479ad03b1f4fca89141",
- "text": "earlier geopolitical tensions, such as those associated with the US-China trade dispute.",
+ "element_id": "0065ea6e5228a2d5574ba0d589525726",
+ "text": "pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing earlier geopolitical tensions, such as those associated with the US-China trade dispute.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2026,7 +2701,7 @@
},
{
"type": "ListItem",
- "element_id": "9fda15cda44df99e579f024b037349d9",
+ "element_id": "213dfe37b16f104c8b97aac565751c03",
"text": "International Monetary Fund | January 2023. 7",
"metadata": {
"filetype": "application/pdf",
@@ -2401,8 +3076,8 @@
},
{
"type": "ListItem",
- "element_id": "65269b45848d66c7d8b45099ddf6a328",
- "text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global",
+ "element_id": "92e17c7b49f5b47d390e74eb4570e1be",
+ "text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2425,9 +3100,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "2ae3be4a3aa90a31eea5a5a306d3837c",
- "text": "distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential. Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty\u2019s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non\u2013 Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes.",
+ "type": "ListItem",
+ "element_id": "70fa5a835209352e5eabef3536f54ea6",
+ "text": "Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty\u2019s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non\u2013 Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2626,8 +3301,8 @@
},
{
"type": "Image",
- "element_id": "16c33c9e209b518305829584935190dd",
- "text": "7 6 5 4 United States Euro area China Other AEs Other EMs October 2022 GFSR 3 2 1 0 \u20131 \u20132 \u20133 2006 08 08 06 10 10 12 12 14 16 14 16 18 18 20 22 22 20 ",
+ "element_id": "23816dd0ae1864f91d16153d457c27a2",
+ "text": "7 6 5 4 3 United States Euro area China Other AEs Other EMs October 2022 GFSR 2 1 0 \u20131 \u20132 \u20133 2006 08 06 08 10 10 12 12 14 16 14 16 18 18 20 22 20 22 ",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2676,8 +3351,33 @@
},
{
"type": "NarrativeText",
- "element_id": "fd3223759c188ca12a81861f36083bb9",
- "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors\u2019 anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia\u2019s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives.",
+ "element_id": "883a98747b49966e50dd49bd113e1e7e",
+ "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors\u2019 anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 11,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "968e19e279ea80fbf994f5d88dfb8e5d",
+ "text": "However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia\u2019s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2701,8 +3401,33 @@
},
{
"type": "NarrativeText",
- "element_id": "9f931c3b54669a87892391846beda98c",
- "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)",
+ "element_id": "c6c6a6b0c3cbc1ea71cf7c3c01a58334",
+ "text": "Figure 1.2. Market-Implied Expectations of Policy Rates",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 11,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "7ecca4609cad7db875f50a681f44c018",
+ "text": "(Percent)",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2726,8 +3451,108 @@
},
{
"type": "Image",
- "element_id": "bffdca980631ced1f96ab886cf9dcf22",
- "text": "Latest October 2022 GFSR 6 1. United States 2. Euro area 5 4 3 2 5 4 3 2 1 1 Oct. 22 Apr. 23 Oct. 23 Dec. 24 Dec. 26 Oct. 22 Apr. 23 Oct. 23 Dec. 24 Dec. 26 ",
+ "element_id": "5f7378ac2edd8680261c6c3d7d09d218",
+ "text": "Latest October 2022 GFSR 6 1. United States 2. Euro area 5 5 4 4 3 3 2 2 1 Oct. Apr. Oct. Dec. Dec. Oct. Apr. Oct. Dec. Dec. 1 22 23 23 24 26 22 23 23 24 26 ",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 11,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "3a23d2294f9cfbb497818d718219197b",
+ "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 11,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "1a36b434e009ab68418605545fc8d0fc",
+ "text": "While many central banks in advanced economies have stepped",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 11,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "FigureCaption",
+ "element_id": "15aa1a08bc273dd83a060288e81eb612",
+ "text": "Sources: Bloomberg Finance L.P.; and IMF staff calculations. Note: GFSR = Global Financial Stability Report.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 11,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": "c7eed4fc056b089a98f6a3ad9ec9373e",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/",
+ "metadata": {
+ "ingest-test": "custom metadata"
+ }
+ },
+ "date_created": "1720544414.0",
+ "date_modified": "1720544414.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "a2b56bcce2b2bb67011eee8026399e51",
+ "text": "down the size of hikes, they have also explicitly stated they will need",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2751,8 +3576,8 @@
},
{
"type": "NarrativeText",
- "element_id": "f787afd6d27e6bfdc89bd82044a417a6",
- "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess their outlook for monetary policy given central bank communications. Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked.",
+ "element_id": "19df82c029a3ce4664e1bab599208e9c",
+ "text": "to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess their outlook for monetary policy given central bank communications. Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2776,7 +3601,7 @@
},
{
"type": "NarrativeText",
- "element_id": "b0002e1597c04292e2e01d4e15cb0dd7",
+ "element_id": "a01645da40596bc8aa78e59b61a7909c",
"text": "Financial market volatility is expected to remain elevated and could be exacerbated by poor market liquidity. For some asset classes (such as US Treasuries), liquidity has deteriorated to the March 2020 lows of the COVID-19 pandemic. With the process of central bank balance sheet reduction (quantitative tightening) underway, market liquidity is expected to remain challenging.",
"metadata": {
"filetype": "application/pdf",
@@ -2801,7 +3626,7 @@
},
{
"type": "ListItem",
- "element_id": "2a9dc522d08d54609f97f566911ceed1",
+ "element_id": "df734ee213c4062e841e9590038ed56e",
"text": "10 \u2014 International Monetary Fund | January 2023",
"metadata": {
"filetype": "application/pdf",
@@ -2826,7 +3651,7 @@
},
{
"type": "NarrativeText",
- "element_id": "103cccd2abc41bc4beb9e70dff33123a",
+ "element_id": "66921f38d7525151f33aaac16264d3d1",
"text": "WEO Update \u00a9 2023 \u2022 ISBN: 979-8-40023-224-4",
"metadata": {
"filetype": "application/pdf",
diff --git a/test_unstructured_ingest/expected-structured-output/s3/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/s3/Silent-Giant-(1).pdf.json
index 160ca011c0..a5ffc03652 100644
--- a/test_unstructured_ingest/expected-structured-output/s3/Silent-Giant-(1).pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/s3/Silent-Giant-(1).pdf.json
@@ -485,8 +485,74 @@
},
{
"type": "Title",
- "element_id": "33098f034d26b209b3e70be26465960f",
- "text": "h W T",
+ "element_id": "92bd98667f1b34402aa093c954a6dfa7",
+ "text": "h",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 4,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "5bca76dfe3baf190899a22ae09c9ca43",
+ "text": "s",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 4,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "6f9184816d5638abaf3a223c4698a7f5",
+ "text": "W",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 4,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "3c6abbc913762fde696988b9735a9a45",
+ "text": "T",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -507,7 +573,7 @@
},
{
"type": "FigureCaption",
- "element_id": "01f202d61754b07afdbc86562947a713",
+ "element_id": "8c5b01e2180e90c104a5c6ff3dddb01b",
"text": "Figure 1. IEA projected electricity production and sources to 2040 i",
"metadata": {
"filetype": "application/pdf",
@@ -529,7 +595,7 @@
},
{
"type": "NarrativeText",
- "element_id": "b4a845da7b0af6bf15d5fb9660b35338",
+ "element_id": "3635fd9bacc9ab79dcd020f4532bc440",
"text": "The challenge before us, however, goes far beyond just electricity \u2013 we will need to find ways to decarbonize all parts of the economy, and we need solutions that are sustainable in the long-term. That means changing the way we heat our homes and power our industrial processes, as well as ensuring that the way we travel, export our products and ship our food moves away from fossil fuels.",
"metadata": {
"filetype": "application/pdf",
@@ -551,7 +617,7 @@
},
{
"type": "NarrativeText",
- "element_id": "034833d2c3067bb56f6083bed72f01e5",
+ "element_id": "df25ad16fbeb65884633621623966031",
"text": "Despite the very considerable efforts to decarbonize the economy and the countless billions spent, our world remains heavily addicted to fossil fuels. The trend is clear \u2013 instead of reducing our dependence on fossil fuels, we are increasing it (Figure 2). As a direct result, greenhouse gas emissions continue to rise when they need to drastically fall.",
"metadata": {
"filetype": "application/pdf",
@@ -573,8 +639,74 @@
},
{
"type": "Title",
- "element_id": "471efbe23e80b54343222bebe9140db9",
- "text": "h W G",
+ "element_id": "1292196cd3253d4e3dcfd0a473c74a35",
+ "text": "h",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 4,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "0d35bab1e280d50b0f8669c2f9648684",
+ "text": "GWh",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 4,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "0edc1cde085a3ad401d79a41389b1499",
+ "text": "W",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 4,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "6096321cc3ef501eb98fcf813b886324",
+ "text": "G",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -595,7 +727,7 @@
},
{
"type": "Image",
- "element_id": "6145856d34f52d6758bdb7b21375c456",
+ "element_id": "88472fa496081ab6787b366ffede21f5",
"text": "30,000,000 High-carbon Low-carbon 25,000,000 20,000,000 15,000,000 10,000,000 5,000,000 0 1990 1995 2000 2005 2010 2015 ",
"metadata": {
"filetype": "application/pdf",
@@ -617,7 +749,7 @@
},
{
"type": "FigureCaption",
- "element_id": "f078972fc2928675f735e4d249863c0d",
+ "element_id": "f864518bc1dd07faecda3b58e676c3db",
"text": "Figure 2. Worldwide electricity generation by fuel (1990-2016)ii",
"metadata": {
"filetype": "application/pdf",
@@ -660,15 +792,477 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "7d04fe32568029462cb22a1d00c986f6",
- "text": "Nuclear energy is already making a major contribution. By using nuclear energy rather than fossil fuels, we currently avoid the emission of more than 2500 million tonnes of carbon dioxide every year. To put that into perspective, it is the equivalent of removing about 400 million cars from the world\u2019s roads.",
+ "type": "NarrativeText",
+ "element_id": "7d04fe32568029462cb22a1d00c986f6",
+ "text": "Nuclear energy is already making a major contribution. By using nuclear energy rather than fossil fuels, we currently avoid the emission of more than 2500 million tonnes of carbon dioxide every year. To put that into perspective, it is the equivalent of removing about 400 million cars from the world\u2019s roads.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Image",
+ "element_id": "74fede67ccf426e4acb08ec09aef38de",
+ "text": "",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "decd1a58dda4f427da1a4b5468337852",
+ "text": "Modern society is dependent on the steady supply of electricity, every day of the year \u2013 regardless of weather, season or time of day \u2013 and nuclear energy is particularly well-suited to providing this service. Given that the majority of baseload supply is fossil-based, an increase in the use of nuclear energy would result in a rapid decarbonization of the electricity system. The International Energy Agency\u2019s (IEA) recent report III on nuclear energy highlighted the importance of dependable baseload electricity generators and the need to properly value and compensate them for the electricity security and reliability services they provide.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Image",
+ "element_id": "ef9cc2d4562f48ab8ef202bce5b108c0",
+ "text": "",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Image",
+ "element_id": "ca9838a7dfb6d5d996f1ab6acd7642af",
+ "text": "",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Image",
+ "element_id": "c65e2845b99488cbce5ee718c2bd054a",
+ "text": "",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Footer",
+ "element_id": "812d182abc51f7c24b14024f28bdc0b7",
+ "text": "3",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Header",
+ "element_id": "3ef3ab436a1c66d6c7aa00cdcfc40873",
+ "text": "4",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "c73abd40dd196cc77bb03f2567f46b03",
+ "text": "Despite impressive recent growth, the stark reality is that renewables alone will not be able to resolve our dependence on fossil fuels. Clearly, the sun does not always shine, and the wind does not always blow, and this is compounded by the fact that many times these periods coincide with when electricity demand is at its highest, but renewables can be complementary to nuclear energy. Storage solutions, such as batteries, will not be able to power our societies for days or weeks when the weather is not favourable. Natural gas is currently the most used solution for the intermittency problem, which only serves to reinforce our economy\u2019s dependence of fossil fuels, and severely undermines the apparently \u2018green credentials\u2019 of many renewables.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "f7078a4205d6d8b21783483c5b426c02",
+ "text": "Moving to a sustainable future",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "62b6b9ec7e962db41cf8818f42b2bc4d",
+ "text": "The Intergovernmental Panel on Climate Change (IPCC) special report on Global Warming of 1.5\u00b0C iv examined a large number of different scenarios for limiting global warming to 1.5\u00b0C. Of those scenarios which would achieve the 1.5\u00b0C target, the mean increase in nuclear energy\u2019s contribution to electricity production was 2.5 times higher compared to today. However, the \u2018middle-of-the-road\u2019 scenario \u2013 in which social, economic, and technological trends follow current patterns and would not require major changes to, for example, diet and travel habits \u2013 sees the need for nuclear increase by five times globally by 2050.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "2673340156781d2b4e8c53921e661b1d",
+ "text": "The IEA has concluded that without an expanded contribution from nuclear energy, the already huge challenge of achieving emissions reductions will become drastically harder and more costly. In their latest report on nuclear energy v, published in 2019, they also conclude that not using nuclear would have negative implications for energy security and result in higher costs for the consumers. The IEA recommends policy reforms to \u2018\u2026 ensure competition on a level playing field\u2019 and that the \u2018\u2026 focus should be on designing electricity markets in a way that values the clean energy and energy security attributes of low-carbon technologies, including nuclear power.\u2019 Such reforms should also ensure that reliability of electricity production is properly valued and compensated.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "8f9b23ea4d2d725ef37c7766f75d8b5d",
+ "text": "As part of the Harmony Programme, the world\u2019s nuclear industry has identified three key policy areas for action to unlock the true potential of nuclear energy - the need for a level playing field, the harmonization of regulations and the establishment of an effective safety paradigm.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "a24ec4b8d4de88ccfcb84af80a57d36d",
+ "text": "In regard to the need for a level playing field, we see that many of the world\u2019s electricity markets operate in an unsustainable fashion, dominated by short-term thinking. Electricity supply which is affordable, reliable and available 24/7 generates broad societal benefits, and as seen in Figure 3, nuclear is one of the most affordable electricity sources.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Image",
+ "element_id": "4178b04e063bf5bc18e18b6c4ce1fa58",
+ "text": "",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 6,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "ae719e11d8d255dfe24e2a27863a694a",
+ "text": "h",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 7,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "23fc271cbcb898bc9bfa66860987a046",
+ "text": "s",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 7,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "d637d0027b1798609aff4ec36b71094e",
+ "text": "W",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 7,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "a31c027dbc6e64b2ee18dc419cd3dc81",
+ "text": "M",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 7,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "UncategorizedText",
+ "element_id": "d515aa2a05bf154c5796d9e8cc3e596c",
+ "text": "/",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 7,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "UncategorizedText",
+ "element_id": "9502fdd43671bea25f50ed487389b450",
+ "text": "&",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 7,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "UncategorizedText",
+ "element_id": "81ee0d51cb69b9dcdd51b32b79466961",
+ "text": "$",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 5,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -683,14 +1277,14 @@
},
{
"type": "Image",
- "element_id": "74fede67ccf426e4acb08ec09aef38de",
- "text": "",
+ "element_id": "3f7011d7133e8cea69708fd140ed26de",
+ "text": "300 250 200 150 100 50 0 m ercial Photovoltaic C o m O nshore Wind Offshore Wind N uclear C C G T C oal ",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 5,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -704,15 +1298,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "decd1a58dda4f427da1a4b5468337852",
- "text": "Modern society is dependent on the steady supply of electricity, every day of the year \u2013 regardless of weather, season or time of day \u2013 and nuclear energy is particularly well-suited to providing this service. Given that the majority of baseload supply is fossil-based, an increase in the use of nuclear energy would result in a rapid decarbonization of the electricity system. The International Energy Agency\u2019s (IEA) recent report III on nuclear energy highlighted the importance of dependable baseload electricity generators and the need to properly value and compensate them for the electricity security and reliability services they provide.",
+ "type": "FigureCaption",
+ "element_id": "3ee5ff75d0029b180337207ef8475230",
+ "text": "Figure 3. Comparative cost projections for main electricity generators vi",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 5,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -726,15 +1320,15 @@
}
},
{
- "type": "Image",
- "element_id": "ef9cc2d4562f48ab8ef202bce5b108c0",
- "text": "",
+ "type": "NarrativeText",
+ "element_id": "736fa1bab48483dff83004e8ff95165a",
+ "text": "However, markets fail to give due credit to electricity generators, such as nuclear energy, that are able to meet these societal demands. This has resulted in situations where nuclear energy has struggled to compete with energy sources that have been subsidized, do not pay the hidden costs brought on by their intermittency (e.g. costly backup provisions and investments in the grid), or do not have to take responsibility for using our common atmosphere as a dumping ground.",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 5,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -748,15 +1342,15 @@
}
},
{
- "type": "Image",
- "element_id": "ca9838a7dfb6d5d996f1ab6acd7642af",
- "text": "",
+ "type": "NarrativeText",
+ "element_id": "c9fcac110dbbc9ae9bc4f1107e7ba2fb",
+ "text": "Additionally, electricity markets fail to recognize the relative costs of different forms of electricity generation. Whilst the nuclear industry takes responsibility for its lifecycle costs (including decommissioning and waste management), other electricity generators do not. Fossil fuel generators are rarely required to pay the price in line with the environmental and health damage that their emissions cause, whilst the cost of wind and solar does not include the disposal of the sometimes toxic materials at the end of their life.",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 5,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -770,15 +1364,15 @@
}
},
{
- "type": "Image",
- "element_id": "c65e2845b99488cbce5ee718c2bd054a",
- "text": "",
+ "type": "NarrativeText",
+ "element_id": "5910d2560146818c7e220a0ea9908e81",
+ "text": "In regard to the need to harmonize regulations, multiple regulatory barriers stemming from diverse national licensing processes and safety requirements currently limit global nuclear trade and investment. A lack of international standardization places unnecessary regulatory burdens on nuclear activities and causes delays in the licensing of new designs, hindering innovation.",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 5,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -792,15 +1386,15 @@
}
},
{
- "type": "Footer",
- "element_id": "812d182abc51f7c24b14024f28bdc0b7",
- "text": "3",
+ "type": "NarrativeText",
+ "element_id": "9ccf535979da441888fc6eb549cc551a",
+ "text": "The International Atomic Energy Agency (IAEA) has highlighted the importance of addressing this issue, concluding that the lack of regulatory harmony \u2018\u2026causes many drawbacks for the entire nuclear industry, including developers, vendors, operators and even regulators themselves\u2026This results in increased costs and reduced predictability in project execution\u2019. vii It is therefore crucial that we harmonize the regulatory process to address these weaknesses, and avoid unnecessary duplication and inconsistencies.",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 5,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -814,15 +1408,15 @@
}
},
{
- "type": "Header",
- "element_id": "3ef3ab436a1c66d6c7aa00cdcfc40873",
- "text": "4",
+ "type": "Image",
+ "element_id": "ac8c0ee7cb998fca52a55248ca187c20",
+ "text": "",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -836,15 +1430,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "c73abd40dd196cc77bb03f2567f46b03",
- "text": "Despite impressive recent growth, the stark reality is that renewables alone will not be able to resolve our dependence on fossil fuels. Clearly, the sun does not always shine, and the wind does not always blow, and this is compounded by the fact that many times these periods coincide with when electricity demand is at its highest, but renewables can be complementary to nuclear energy. Storage solutions, such as batteries, will not be able to power our societies for days or weeks when the weather is not favourable. Natural gas is currently the most used solution for the intermittency problem, which only serves to reinforce our economy\u2019s dependence of fossil fuels, and severely undermines the apparently \u2018green credentials\u2019 of many renewables.",
+ "type": "Footer",
+ "element_id": "14648c87ef8c4b36c2a20d71cc6ffc36",
+ "text": "5",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 7,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -858,15 +1452,15 @@
}
},
{
- "type": "Title",
- "element_id": "f7078a4205d6d8b21783483c5b426c02",
- "text": "Moving to a sustainable future",
+ "type": "Header",
+ "element_id": "eda2d3adf2e4a1f9064252ed95826bf6",
+ "text": "6",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -881,14 +1475,14 @@
},
{
"type": "NarrativeText",
- "element_id": "62b6b9ec7e962db41cf8818f42b2bc4d",
- "text": "The Intergovernmental Panel on Climate Change (IPCC) special report on Global Warming of 1.5\u00b0C iv examined a large number of different scenarios for limiting global warming to 1.5\u00b0C. Of those scenarios which would achieve the 1.5\u00b0C target, the mean increase in nuclear energy\u2019s contribution to electricity production was 2.5 times higher compared to today. However, the \u2018middle-of-the-road\u2019 scenario \u2013 in which social, economic, and technological trends follow current patterns and would not require major changes to, for example, diet and travel habits \u2013 sees the need for nuclear increase by five times globally by 2050.",
+ "element_id": "2ee71305ea2aeb159dce93675cb959d8",
+ "text": "In regard to the need for a holistic safety paradigm for the whole electricity system, we need to consider safety from a societal perspective, something the current energy system fails to do. The health, environmental and safety benefits of nuclear energy are not sufficiently understood and valued when compared with other electricity sources. Nuclear energy remains the safest form of electricity generation (Figure 4). Additionally, the use of nuclear consistently prevents many tens of thousands of deaths (mainly resulting from air pollution) every year by avoiding the use of coal - lifesaving measures which must be better recognised and valued.",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -902,15 +1496,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "2673340156781d2b4e8c53921e661b1d",
- "text": "The IEA has concluded that without an expanded contribution from nuclear energy, the already huge challenge of achieving emissions reductions will become drastically harder and more costly. In their latest report on nuclear energy v, published in 2019, they also conclude that not using nuclear would have negative implications for energy security and result in higher costs for the consumers. The IEA recommends policy reforms to \u2018\u2026 ensure competition on a level playing field\u2019 and that the \u2018\u2026 focus should be on designing electricity markets in a way that values the clean energy and energy security attributes of low-carbon technologies, including nuclear power.\u2019 Such reforms should also ensure that reliability of electricity production is properly valued and compensated.",
+ "type": "Image",
+ "element_id": "8e0bbd5fb39debf8b0c1fb39729e3d3e",
+ "text": "140 120 120 e 100 99.5 80 71.9 60 40 20 0 C oal Oil N atural gas 8.5 1.78 Offshore wind O nshore wind (U K) (G erm any) 0.245 S olar P V <0.01 N uclear* ",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -924,15 +1518,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "8f9b23ea4d2d725ef37c7766f75d8b5d",
- "text": "As part of the Harmony Programme, the world\u2019s nuclear industry has identified three key policy areas for action to unlock the true potential of nuclear energy - the need for a level playing field, the harmonization of regulations and the establishment of an effective safety paradigm.",
+ "type": "Title",
+ "element_id": "de42faf5943e8ec777eb889399b1233e",
+ "text": "r",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -946,15 +1540,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "a24ec4b8d4de88ccfcb84af80a57d36d",
- "text": "In regard to the need for a level playing field, we see that many of the world\u2019s electricity markets operate in an unsustainable fashion, dominated by short-term thinking. Electricity supply which is affordable, reliable and available 24/7 generates broad societal benefits, and as seen in Figure 3, nuclear is one of the most affordable electricity sources.",
+ "type": "Title",
+ "element_id": "ac9c6ff58f5dc8d39939e2530a613d41",
+ "text": "a",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -968,15 +1562,15 @@
}
},
{
- "type": "Image",
- "element_id": "4178b04e063bf5bc18e18b6c4ce1fa58",
- "text": "",
+ "type": "Title",
+ "element_id": "0b221d5be4ec069b38af3b21b63ae23e",
+ "text": "e",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 6,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -991,14 +1585,14 @@
},
{
"type": "Title",
- "element_id": "792f93fd194ae2fea77fd5a477bb1d5e",
- "text": "h W M / $",
+ "element_id": "54f11d2c83e64a62db8a608445b3cf18",
+ "text": "y",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 7,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -1012,15 +1606,15 @@
}
},
{
- "type": "Image",
- "element_id": "9cad28e919656850c215a73aea60024c",
- "text": "300 250 200 150 100 50 0 m ercial Photovoltaic C o m O nshore Wind Offshore Wind N uclear C C G T C oal ",
+ "type": "Title",
+ "element_id": "388cebdd46c378c6e9996a827df32e49",
+ "text": "W",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 7,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -1034,15 +1628,15 @@
}
},
{
- "type": "FigureCaption",
- "element_id": "2821726ab54800008a4692eb9f89171f",
- "text": "Figure 3. Comparative cost projections for main electricity generators vi",
+ "type": "Title",
+ "element_id": "12956572dcf3fa03393f3a68377eddf0",
+ "text": "T",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 7,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -1056,15 +1650,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "3d5cde2213eaec20cc2c5c77e68354cb",
- "text": "However, markets fail to give due credit to electricity generators, such as nuclear energy, that are able to meet these societal demands. This has resulted in situations where nuclear energy has struggled to compete with energy sources that have been subsidized, do not pay the hidden costs brought on by their intermittency (e.g. costly backup provisions and investments in the grid), or do not have to take responsibility for using our common atmosphere as a dumping ground.",
+ "type": "Title",
+ "element_id": "66d76c4c8747fcb81ba4fc9fd381a78a",
+ "text": "r",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 7,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -1078,15 +1672,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "e37ee5f00fe8522280ab816ef335e148",
- "text": "Additionally, electricity markets fail to recognize the relative costs of different forms of electricity generation. Whilst the nuclear industry takes responsibility for its lifecycle costs (including decommissioning and waste management), other electricity generators do not. Fossil fuel generators are rarely required to pay the price in line with the environmental and health damage that their emissions cause, whilst the cost of wind and solar does not include the disposal of the sometimes toxic materials at the end of their life.",
+ "type": "Title",
+ "element_id": "e9301dcb3ceb1b4bad5fb799361d8acb",
+ "text": "e",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 7,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -1100,15 +1694,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "f26c420afe7376f1d600ec5c54a51c46",
- "text": "In regard to the need to harmonize regulations, multiple regulatory barriers stemming from diverse national licensing processes and safety requirements currently limit global nuclear trade and investment. A lack of international standardization places unnecessary regulatory burdens on nuclear activities and causes delays in the licensing of new designs, hindering innovation.",
+ "type": "Title",
+ "element_id": "9c76907741cad579207a5029987e76c9",
+ "text": "p",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 7,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -1122,15 +1716,15 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "0bd2429b9134c65e8e70f0763ba328c7",
- "text": "The International Atomic Energy Agency (IAEA) has highlighted the importance of addressing this issue, concluding that the lack of regulatory harmony \u2018\u2026causes many drawbacks for the entire nuclear industry, including developers, vendors, operators and even regulators themselves\u2026This results in increased costs and reduced predictability in project execution\u2019. vii It is therefore crucial that we harmonize the regulatory process to address these weaknesses, and avoid unnecessary duplication and inconsistencies.",
+ "type": "Title",
+ "element_id": "a4f0803790d664e40fcd0a4bc64314f5",
+ "text": "s",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 7,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -1144,15 +1738,15 @@
}
},
{
- "type": "Image",
- "element_id": "b6e8daf0bbaa32c5f37ac261b325affb",
- "text": "",
+ "type": "Title",
+ "element_id": "63043c3c2ee23593b0320a6d3837c5ac",
+ "text": "e",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 7,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -1166,15 +1760,15 @@
}
},
{
- "type": "Footer",
- "element_id": "923523789c0e08c68514bd56bdff607e",
- "text": "5",
+ "type": "Title",
+ "element_id": "5f09dcb328ce448d02364226c9cab2e7",
+ "text": "i",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 7,
+ "page_number": 8,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -1188,9 +1782,9 @@
}
},
{
- "type": "Header",
- "element_id": "eda2d3adf2e4a1f9064252ed95826bf6",
- "text": "6",
+ "type": "Title",
+ "element_id": "f862278c28cd9eb349749c1cb2905f11",
+ "text": "t",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1210,9 +1804,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "2ee71305ea2aeb159dce93675cb959d8",
- "text": "In regard to the need for a holistic safety paradigm for the whole electricity system, we need to consider safety from a societal perspective, something the current energy system fails to do. The health, environmental and safety benefits of nuclear energy are not sufficiently understood and valued when compared with other electricity sources. Nuclear energy remains the safest form of electricity generation (Figure 4). Additionally, the use of nuclear consistently prevents many tens of thousands of deaths (mainly resulting from air pollution) every year by avoiding the use of coal - lifesaving measures which must be better recognised and valued.",
+ "type": "Title",
+ "element_id": "c695a9aedca91bc8f392c2783835ef8d",
+ "text": "i",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1232,9 +1826,9 @@
}
},
{
- "type": "Image",
- "element_id": "b0f97a1e3e431fa2d186e2e8fde4dc18",
- "text": "140 e 120 100 120 99.5 80 60 71.9 40 20 0 C oal Oil N atural gas 8.5 1.78 Offshore wind O nshore wind (G erm any) (U K) 0.245 S olar P V <0.01 N uclear* ",
+ "type": "Title",
+ "element_id": "d34b367940f7860d5f5e4fd2cc4b8b70",
+ "text": "l",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1254,9 +1848,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "ff300ef7a936a82104bf89668428c28c",
- "text": "r a e y",
+ "type": "Title",
+ "element_id": "cca17c0a2b67304725d20609502adca5",
+ "text": "a",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1277,8 +1871,8 @@
},
{
"type": "Title",
- "element_id": "f6ad5695f9388190089a73994c4afbf3",
- "text": "W T",
+ "element_id": "ac31374fe4ed34baf3eb2003f7351892",
+ "text": "t",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1298,9 +1892,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "54f6293185bdb444eb1802ba80bee640",
- "text": "r e p s e i t i l",
+ "type": "Title",
+ "element_id": "9d4bb2307dd7cbdec23db892896abddf",
+ "text": "a",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1321,8 +1915,8 @@
},
{
"type": "Title",
- "element_id": "a98bee8174850612900731525ab6659a",
- "text": "a t a F",
+ "element_id": "a6197f50085f3554dc07c2877ee7b41b",
+ "text": "F",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1343,7 +1937,7 @@
},
{
"type": "FigureCaption",
- "element_id": "15cd2f288bebfc1813d77e0aa050a0f8",
+ "element_id": "3aa84716ff5233e822c7249690b2d8b4",
"text": "Figure 4. Comparison of number of fatalities due to electricity generation viii",
"metadata": {
"filetype": "application/pdf",
@@ -1365,7 +1959,7 @@
},
{
"type": "Title",
- "element_id": "33b46a3854e000ba41f08b7ef320f2f8",
+ "element_id": "7a65402e47d11b495c66e713559a999a",
"text": "Nuclear for a sustainable tomorrow",
"metadata": {
"filetype": "application/pdf",
@@ -1387,7 +1981,7 @@
},
{
"type": "NarrativeText",
- "element_id": "bd7211e18d89695df6a6de6a483df434",
+ "element_id": "b4109415071978b3a71ed9f647019699",
"text": "Nuclear energy is already making a significant contribution to providing the world with clean and abundant electricity, and has a proven track record of being a reliable workhorse around the world. Countries like France, Sweden and Switzerland have proven that it is possible to divorce economic growth from an increase in damaging emissions and over the timescales required to effectively challenge climate change and environmental degradation (Figures 5 and 6). Nuclear can ensure that fast-growing populations achieve rising standards of living \u2013 without having to sacrifice the planet or their own well-being.",
"metadata": {
"filetype": "application/pdf",
@@ -1409,7 +2003,7 @@
},
{
"type": "Image",
- "element_id": "89efb2f48dddb7494311db2e7042ee35",
+ "element_id": "15b67cc3eb0e430dae3e8d9e12e4e37e",
"text": "100 90 Coal Gas/Oil 80 Biofuels/Waste 70 Wind/Solar 60 Hydro Nuclear 50 40 30 20 10 0 ",
"metadata": {
"filetype": "application/pdf",
@@ -1431,7 +2025,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "c591dd1b7ac1725959db78546ef69f71",
+ "element_id": "80738a23e7bf88f3820128d60a895089",
"text": "%",
"metadata": {
"filetype": "application/pdf",
@@ -1453,7 +2047,7 @@
},
{
"type": "Title",
- "element_id": "dc980c7616ede261ebcd41fe78bda526",
+ "element_id": "61ecf2c61ec7d6215080d9d1664e5d1c",
"text": "France",
"metadata": {
"filetype": "application/pdf",
@@ -1475,7 +2069,7 @@
},
{
"type": "Title",
- "element_id": "90bae966dbf38298ae084516c468e916",
+ "element_id": "9422c33c0a8fabbf64e82e95d5e0b207",
"text": "Sweden",
"metadata": {
"filetype": "application/pdf",
@@ -1497,7 +2091,7 @@
},
{
"type": "Title",
- "element_id": "29a073bcccb94157b4ae3f71ff1f88ea",
+ "element_id": "0e546540a9a9aa70d99851239daecf14",
"text": "Switzerland",
"metadata": {
"filetype": "application/pdf",
@@ -1519,7 +2113,7 @@
},
{
"type": "FigureCaption",
- "element_id": "f2db72028ec4f864ab56fdc75cd85e35",
+ "element_id": "07c8425839011f9d80e1b97c3efe9c4c",
"text": "Figure 5. The importance of nuclear in ensuring clean energy systems in France, Sweden and Switzerland ix",
"metadata": {
"filetype": "application/pdf",
@@ -1563,8 +2157,8 @@
},
{
"type": "Title",
- "element_id": "d25c222637cda44434c666939ff6a267",
- "text": "h W T",
+ "element_id": "089092d37b3ef254dd7bf8d75eb6155e",
+ "text": "h",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1584,9 +2178,9 @@
}
},
{
- "type": "FigureCaption",
- "element_id": "b2b7585faff336f0134b3b48464cf4c6",
- "text": "1974 1980 1985 1990 1995 2000 2005 2010 2017",
+ "type": "Title",
+ "element_id": "8416450b038776db835d2cb556f4bb43",
+ "text": "W",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1606,9 +2200,9 @@
}
},
{
- "type": "FigureCaption",
- "element_id": "f71ca4190bbe3083f902fcb238f6c22f",
- "text": "Figure 6. The lasting decarbonization of French electricity and nuclear\u2019s ability to meet growing demand x",
+ "type": "UncategorizedText",
+ "element_id": "163aff0137139580e9130822a33b76e9",
+ "text": "=",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1628,9 +2222,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "04e02981e7fd3a4a7d8e5a064f0f5775",
- "text": "The incredible energy density of uranium means that just a few kilos is all that is required to provide one person with enough power for a lifetime. Uranium is abundant and can be found in many parts of the world, as well as in seawater. Furthermore, spent nuclear fuel is well managed and can in most cases be recycled to produce even more power. By using nuclear energy, countries are able to take charge of their own destinies by decreasing their reliance on imported energy \u2013 enhanced independence and security in uncertain times.",
+ "type": "Title",
+ "element_id": "419111610b2c95d41868ef56de17b1fb",
+ "text": "T",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1650,9 +2244,9 @@
}
},
{
- "type": "Image",
- "element_id": "f05002752f0e36d49670de3052a5cb7d",
- "text": "One fuel pellet contains as much energy as a tonne of coal ",
+ "type": "FigureCaption",
+ "element_id": "098e07fdce3f0d6957815fa4485ed250",
+ "text": "1974 1980 1985 1990 1995 2000 2005 2010 2017",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1672,9 +2266,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "3689b3ec615ae8ea26856a7e326987ce",
- "text": "Unlike other power sources, nuclear energy helps us reduce our total footprint, going beyond just the environment. When accounting for factors such as cost (e.g. fuel and construction costs), carbon (lifecycle greenhouse gas emissions), water and land footprints, nuclear is far ahead of all other energy generators.",
+ "type": "FigureCaption",
+ "element_id": "763c44446ac4be17007bbdb28f62e3fc",
+ "text": "Figure 6. The lasting decarbonization of French electricity and nuclear\u2019s ability to meet growing demand x",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1695,8 +2289,8 @@
},
{
"type": "NarrativeText",
- "element_id": "4ae8fb0de2a9299285a47864f4e352d7",
- "text": "Nuclear energy offers a multitude of services beyond just electricity. With nuclear, we can decarbonize the way we heat our homes, provide process heat for industry, and ensure access to clean water. As freshwater supplies come under increasing pressure worldwide, nuclear reactors can provide desalination, ensuring a reliable flow of fresh drinking water in areas where it is scarce.",
+ "element_id": "1fd1c9ddbac31e1a8c7a08ae647a65d4",
+ "text": "The incredible energy density of uranium means that just a few kilos is all that is required to provide one person with enough power for a lifetime. Uranium is abundant and can be found in many parts of the world, as well as in seawater. Furthermore, spent nuclear fuel is well managed and can in most cases be recycled to produce even more power. By using nuclear energy, countries are able to take charge of their own destinies by decreasing their reliance on imported energy \u2013 enhanced independence and security in uncertain times.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1716,9 +2310,9 @@
}
},
{
- "type": "Header",
- "element_id": "d7bc33bc5c4eb3d83f8a56f968d1f232",
- "text": "7",
+ "type": "Image",
+ "element_id": "66e361f472041d2cb74ac1acc2f78816",
+ "text": "One fuel pellet contains as much energy as a tonne of coal ",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1738,15 +2332,15 @@
}
},
{
- "type": "Header",
- "element_id": "d484deba727a0cb854f93482219a24df",
- "text": "8",
+ "type": "NarrativeText",
+ "element_id": "8c625352fad54a7372c9181d4a378317",
+ "text": "Unlike other power sources, nuclear energy helps us reduce our total footprint, going beyond just the environment. When accounting for factors such as cost (e.g. fuel and construction costs), carbon (lifecycle greenhouse gas emissions), water and land footprints, nuclear is far ahead of all other energy generators.",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 10,
+ "page_number": 9,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -1761,14 +2355,14 @@
},
{
"type": "NarrativeText",
- "element_id": "f11771cc97b73e3ec86beb7752b20371",
- "text": "Nuclear energy can be relied upon to power the new mobility revolution taking place. Every day, we use almost 20 million barrels of oil to power our vehicles. By swapping to an electric or hydrogen-powered transport fleet \u2013 all powered by the atom \u2013 we are able to address one of the key challenges to a sustainable economy.",
+ "element_id": "8b00a9cb769b0010442358c07b7cf61f",
+ "text": "Nuclear energy offers a multitude of services beyond just electricity. With nuclear, we can decarbonize the way we heat our homes, provide process heat for industry, and ensure access to clean water. As freshwater supplies come under increasing pressure worldwide, nuclear reactors can provide desalination, ensuring a reliable flow of fresh drinking water in areas where it is scarce.",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
- "page_number": 10,
+ "page_number": 9,
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
"version": "8570bd087066350a84dd8d0ea86f11c6",
@@ -1782,9 +2376,31 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "945c8c5aeca9696e2a3c0a9b2de0e1aa",
- "text": "We cannot afford to wait \u2013 we need every part of the puzzle to contribute towards solving some of the greatest challenges faced by humankind in a very long time. The impacts of climate change will hit the poorest and most vulnerable first, and failing to act will have significant humanitarian consequences.",
+ "type": "Header",
+ "element_id": "7f6b3bc5618ee545017d747a4b96f093",
+ "text": "7",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 9,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
+ "version": "8570bd087066350a84dd8d0ea86f11c6",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196636.0",
+ "date_modified": "1676196636.0"
+ }
+ }
+ },
+ {
+ "type": "Header",
+ "element_id": "d484deba727a0cb854f93482219a24df",
+ "text": "8",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1805,8 +2421,8 @@
},
{
"type": "NarrativeText",
- "element_id": "b79b5b8e937c1a714d3286c6d70881aa",
- "text": "Nuclear power is the silent giant of today\u2019s energy system \u2013 it runs quietly in the background, capable of delivering immense amounts of power, regardless of weather or season, allowing us to focus on everything else in life. It is a technology that is available now, and can be expanded quickly across the world to help us solve some of the most defining challenges we face. Nuclear energy holds the potential to herald a new, cleaner and truly sustainable world \u2013 enabling us to pass on a cleaner planet to our children.",
+ "element_id": "f11771cc97b73e3ec86beb7752b20371",
+ "text": "Nuclear energy can be relied upon to power the new mobility revolution taking place. Every day, we use almost 20 million barrels of oil to power our vehicles. By swapping to an electric or hydrogen-powered transport fleet \u2013 all powered by the atom \u2013 we are able to address one of the key challenges to a sustainable economy.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1826,9 +2442,9 @@
}
},
{
- "type": "Image",
- "element_id": "936ab104c18b374610d851e57072f45e",
- "text": "",
+ "type": "NarrativeText",
+ "element_id": "945c8c5aeca9696e2a3c0a9b2de0e1aa",
+ "text": "We cannot afford to wait \u2013 we need every part of the puzzle to contribute towards solving some of the greatest challenges faced by humankind in a very long time. The impacts of climate change will hit the poorest and most vulnerable first, and failing to act will have significant humanitarian consequences.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1848,9 +2464,9 @@
}
},
{
- "type": "Title",
- "element_id": "d88e18fce3a0a1e4f483fc03555b2dbd",
- "text": "References",
+ "type": "NarrativeText",
+ "element_id": "b79b5b8e937c1a714d3286c6d70881aa",
+ "text": "Nuclear power is the silent giant of today\u2019s energy system \u2013 it runs quietly in the background, capable of delivering immense amounts of power, regardless of weather or season, allowing us to focus on everything else in life. It is a technology that is available now, and can be expanded quickly across the world to help us solve some of the most defining challenges we face. Nuclear energy holds the potential to herald a new, cleaner and truly sustainable world \u2013 enabling us to pass on a cleaner planet to our children.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1870,9 +2486,9 @@
}
},
{
- "type": "Title",
- "element_id": "97b38e1436971a086af6f1e3fa907126",
- "text": "i",
+ "type": "Image",
+ "element_id": "936ab104c18b374610d851e57072f45e",
+ "text": "",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1893,8 +2509,8 @@
},
{
"type": "Title",
- "element_id": "112b81b1155f39c21f5cc5ae789f2acd",
- "text": "ii",
+ "element_id": "d88e18fce3a0a1e4f483fc03555b2dbd",
+ "text": "References",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1914,9 +2530,9 @@
}
},
{
- "type": "Title",
- "element_id": "25b3d0bfc74512aec787c31f5889ae32",
- "text": "iii",
+ "type": "ListItem",
+ "element_id": "1de41ba699195c6548f3f6f93c8993c4",
+ "text": "i International Energy Agency (2018), World Energy Outlook 2018. Data accessed from https://www.iea.org/weo/ \u2013 Based on the New Policies Scenario, which incorporates existing energy policies as well as an assessment of the results likely to stem from the implementation of announced policy intentions \u2013 with visual modification by World Nuclear Association.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1936,9 +2552,9 @@
}
},
{
- "type": "Title",
- "element_id": "7806e49f513bb91010cfa16fd65d1e27",
- "text": "iv",
+ "type": "ListItem",
+ "element_id": "f1d85d4bef3bf48fc8b292673151f14a",
+ "text": "ii International Energy Agency (n.d.), Statistics. Accessed from: https://www.iea.org/statistics/?country=WORLD&year=2016&category=Electricity&indicator=ElecGenByFuel&mode =chart&dataTable=ELECTRICITYANDHEAT \u2013 with visual modifications by World Nuclear Association.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1958,9 +2574,9 @@
}
},
{
- "type": "Title",
- "element_id": "84643eaf522b55520d3286fc650b8565",
- "text": "Vv",
+ "type": "ListItem",
+ "element_id": "85b2a061c21fe3c6368048516aecea3e",
+ "text": "iii International Energy Agency (2019), Nuclear Power in a Clean Energy System. Accessed from: https://www.iea.org/ publications/nuclear/",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1980,9 +2596,9 @@
}
},
{
- "type": "Title",
- "element_id": "65e28360f4a9960fd1a079d3bc38f5fa",
- "text": "vi",
+ "type": "ListItem",
+ "element_id": "450a01b96badb58fa247323a8624d033",
+ "text": "iv Intergovernmental Panel on Climate Change (2018), Special Report on Global Warming of 1.5 \u00b0C. Accessed from: https://www.ipcc.ch/sr15/",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2002,9 +2618,9 @@
}
},
{
- "type": "Title",
- "element_id": "d925df783207d4d09021cab09a5f2799",
- "text": "vii",
+ "type": "ListItem",
+ "element_id": "8e0d519ee7f210f670c90fd7cfc19366",
+ "text": "v International Energy Agency (2019), Nuclear Power in a Clean Energy System. Accessed from: https://www.iea.org/ publications/nuclear/",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2024,9 +2640,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "efc2aec0f176f924eeba5e86eb6691ba",
- "text": "International Energy Agency (2018), World Energy Outlook 2018. Data accessed from https://www.iea.org/weo/ \u2013 Based on the New Policies Scenario, which incorporates existing energy policies as well as an assessment of the results likely to stem from the implementation of announced policy intentions \u2013 with visual modification by World Nuclear Association. International Energy Agency (n.d.), Statistics. Accessed from: https://www.iea.org/statistics/?country=WORLD&year=2016&category=Electricity&indicator=ElecGenByFuel&mode =chart&dataTable=ELECTRICITYANDHEAT \u2013 with visual modifications by World Nuclear Association. International Energy Agency (2019), Nuclear Power in a Clean Energy System. Accessed from: https://www.iea.org/ publications/nuclear/ Intergovernmental Panel on Climate Change (2018), Special Report on Global Warming of 1.5 \u00b0C. Accessed from: https://www.ipcc.ch/sr15/ International Energy Agency (2019), Nuclear Power in a Clean Energy System. Accessed from: https://www.iea.org/ publications/nuclear/ International Energy Agency & OECD Nuclear Energy Agency (2015), Projected Costs of generating Electricity \u2013 2015 Edition. Accessed from: https://www.oecd-nea.org/ndd/pubs/2015/7057-proj-costs-electricity-2015.pdf International Atomic Energy Agency (2015), Technical challenges in the application and licensing of digital instrumentation and control systems in nuclear power plants. Accessed from: https://www-pub.iaea.org/MTCD/ Publications/PDF/P1695_web.pdf",
+ "type": "ListItem",
+ "element_id": "957bb703bf86ab0b572e6c2edd3129be",
+ "text": "vi International Energy Agency & OECD Nuclear Energy Agency (2015), Projected Costs of generating Electricity \u2013 2015 Edition. Accessed from: https://www.oecd-nea.org/ndd/pubs/2015/7057-proj-costs-electricity-2015.pdf",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2047,8 +2663,8 @@
},
{
"type": "ListItem",
- "element_id": "01a35264bb99e41c06ee8164588c3c0d",
- "text": "viii Paul-Scherrer Institute. Data for nuclear accidents modified to reflect UNSCEAR findings/recommendations (2012)",
+ "element_id": "b18e0f9ae742a9c86f8ba044a917783a",
+ "text": "vii International Atomic Energy Agency (2015), Technical challenges in the application and licensing of digital instrumentation and control systems in nuclear power plants. Accessed from: https://www-pub.iaea.org/MTCD/ Publications/PDF/P1695_web.pdf",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2068,9 +2684,9 @@
}
},
{
- "type": "UncategorizedText",
- "element_id": "62ac61c2f315bde0967d8b87f5a8d22f",
- "text": "and NRC SOARCA study 2015 International Energy Agency (2018), Electricity Information 2018 https://webstore.iea.org/electricity-information-2018-overview Ibid.",
+ "type": "ListItem",
+ "element_id": "01f30481039f116a9fb3b8fac0f5f4f7",
+ "text": "viii Paul-Scherrer Institute. Data for nuclear accidents modified to reflect UNSCEAR findings/recommendations (2012) and NRC SOARCA study 2015",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2090,9 +2706,9 @@
}
},
{
- "type": "Title",
- "element_id": "e1bd8d64a882d38da22d24986334d476",
- "text": "ix",
+ "type": "ListItem",
+ "element_id": "65a355798894c38302ade1bf736ba7f8",
+ "text": "ix International Energy Agency (2018), Electricity Information 2018 https://webstore.iea.org/electricity-information-2018-overview",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2113,8 +2729,8 @@
},
{
"type": "ListItem",
- "element_id": "ab8476cb57b2e0de5a27e919c27a88ee",
- "text": "x",
+ "element_id": "8456a25775974c221c6056103892ce3a",
+ "text": "x Ibid.",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -2135,7 +2751,7 @@
},
{
"type": "NarrativeText",
- "element_id": "95f8be8e7552c5b5045246a1f7df0e0a",
+ "element_id": "d5a24eafb6e08190a1cd693abb8ae0fd",
"text": "Photo credits: Front cover: Mike Baird; page 2: Vattenfall; page 4: Getty Images; page 5: Adobe Stock; page 6: Rosatom; page 8: Dean Calma, IAEA; page 10: Kazatomprom; page 11: EDF.",
"metadata": {
"filetype": "application/pdf",
diff --git a/test_unstructured_ingest/expected-structured-output/s3/page-with-formula.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/page-with-formula.pdf.json
index 49147dbf2c..85e7118d4f 100644
--- a/test_unstructured_ingest/expected-structured-output/s3/page-with-formula.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/s3/page-with-formula.pdf.json
@@ -331,8 +331,8 @@
},
{
"type": "NarrativeText",
- "element_id": "ab8cefbb53c308302ee0e3c0c7ecfd25",
- "text": "Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension dmodel. We also use the usual learned linear transfor- mation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax dmodel. linear transformation, similar to [30]. In the embedding layers, we multiply those weights by",
+ "element_id": "ebdf8de46645084127f7ff7b24ed87e9",
+ "text": "Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension dmodel. We also use the usual learned linear transfor- mation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [30]. In the embedding layers, we multiply those weights by \u221a dmodel.",
"metadata": {
"filetype": "application/pdf",
"languages": [
diff --git a/test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.pdf.json
index fa25d598c0..a4d6e9f063 100644
--- a/test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.pdf.json
@@ -375,8 +375,8 @@
},
{
"type": "NarrativeText",
- "element_id": "31ed39cf3f959ddf86d3eba65cb79a01",
- "text": "Nuclear energy and the risk of radiation is one of the most extreme cases in which perceived and actual risks have diverged. The fear of radiation, whilst pre- dating the Second World War, was firmly established by the debate on the potential impacts of low-dose radiation from the fallout from nuclear weapons testing in the early years of the Cold War. Radiation in many ways became linked with the mental imagery of nuclear war, playing an important role in increasing public concern about radiation and its health effects. There is a well-established discrepancy between fact-based risk assessments and public perception of different risks. This is very much the case with nuclear power, and this is clearly highlighted in Figure 1, with laypersons ranking nuclear power as the highest risk out of 30 activities and technologies, with experts ranking nuclear as 20th. In many ways, popular culture\u2019s depiction of radiation has played a role in ensuring that this discrepancy has remained, be it Godzilla, The Incredible Hulk, or The Simpsons, which regularly plays on the notion of radiation from nuclear power plants causing three-eyed fish, something that has been firmly rejected as unscientific.",
+ "element_id": "75f32a291a5cbb11d3183eac5fb426c3",
+ "text": "Nuclear energy and the risk of radiation is one of the most extreme cases in which perceived and actual risks have diverged. The fear of radiation, whilst pre- dating the Second World War, was firmly established by the debate on the potential impacts of low-dose radiation from the fallout from nuclear weapons testing in the early years of the Cold War. Radiation in many ways became linked with the mental imagery of nuclear war, playing an important role in increasing public concern about radiation and its health effects. There is a well-established discrepancy between fact-based risk assessments and public perception of different risks. This is very much the case with nuclear power, and this is clearly highlighted in Figure 1, with laypersons ranking nuclear power as the highest risk out of 30 activities and technologies, with experts ranking nuclear as 20th. In many ways, popular culture\u2019s depiction of radiation has played a role in ensuring that this discrepancy has remained,",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -397,8 +397,8 @@
},
{
"type": "Title",
- "element_id": "a66214340855880a5393384d1363511c",
- "text": "Rank Order Laypersons",
+ "element_id": "bf248ce5194cc4686f97a2769cd9744a",
+ "text": "Rank Order",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -419,8 +419,30 @@
},
{
"type": "Table",
- "element_id": "9512f477364e1da1fa60dbd237c41f85",
- "text": "Experts 1 20 Nuclear power Motor vehicles 2 1 4 3 Handguns 2 4 Smoking Electric power (non-nuclear) 9 17 22 7 X-rays 25 30 Vaccinations",
+ "element_id": "fe081bc8fc80f0df977f46493f0e9430",
+ "text": "Laypersons Experts 1 Nuclear power 20 2 Motor vehicles 1 3 Handguns 4 4 Smoking 2 17 Electric power (non-nuclear) 9 22 X-rays 7 30 Vaccinations 25",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 4,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "FigureCaption",
+ "element_id": "a6b2ef41c3420b21165799903ccece40",
+ "text": "Figure 1. Ordering of perceived risks for 30 activities and technologies1,iii",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -441,7 +463,29 @@
},
{
"type": "NarrativeText",
- "element_id": "3ff36869cefb14183f0955094a908fc5",
+ "element_id": "6a806ec2935a7f5da409a3f892006982",
+ "text": "be it Godzilla, The Incredible Hulk, or The Simpsons, which regularly plays on the notion of radiation from nuclear power plants causing three-eyed fish, something that has been firmly rejected as unscientific.",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 4,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "b794118cfefdf05953a1eab3ecb49928",
"text": "In reality, radiation is a natural part of life; indeed, we are all exposed to radiation every day, on average receiving 2-3 millisieverts (mSv) per year. Most of this radiation is naturally occurring, with radon gas from the ground being the main source of exposure. The nuclear industry is responsible for a very small part of radiation exposure to the public, as seen in Figure 2. To put this into perspective, eating 10 bananas or two Brazil nuts results in the same radiation dose as living nearby a nuclear power plant for a year. Humans are also naturally radioactive, and the radiation dose from sleeping next to someone else each night for a year is ten times higher than the exposure from living nearby a nuclear power plant for the same time span.",
"metadata": {
"filetype": "application/pdf",
@@ -463,7 +507,7 @@
},
{
"type": "NarrativeText",
- "element_id": "773667f6d4fbe19cc347ace06ca3664e",
+ "element_id": "60d4e572d6bfa2687304ba3b28f12fcf",
"text": "In fact, scientific consensus is that when it comes to preventing exposure to radiation, nuclear power is much better than other electricity generators. A 2016 reportiii from the United Nations Scientific Committee on the Effects of Atomic Radiation (UNSCEAR) found that coal-generated electricity is responsible for more than half of the total global radiation exposure arising from electricity generation, while nuclear power contributed less than a fifth. Coal miners received high occupational exposure and workers in solar and wind farms received the highest occupational exposure associated with plant construction for the same amount of installed capacity.",
"metadata": {
"filetype": "application/pdf",
@@ -485,7 +529,7 @@
},
{
"type": "NarrativeText",
- "element_id": "6dab7cb99fa308838e8c0413caccb7f1",
+ "element_id": "9600ead8e685b080a9ae84455b6be4f9",
"text": "1 The original study was published in 1978, but its findings have been confirmed by numerous studies since.",
"metadata": {
"filetype": "application/pdf",
@@ -507,8 +551,8 @@
},
{
"type": "Image",
- "element_id": "57a9b2172894596e88b48caac276416d",
- "text": "Natural Artificial 48% Radon 14% Buildings & soil 12% Food & water 10% Cosmic 4% Thoron 11% Medicine 0.4% 0.4% Miscellaneous 0.2% Occupational 0.04% Nuclear discharges Fallout ",
+ "element_id": "695e4e06071f6ed026e30f329659adff",
+ "text": "Natural Artificial 48% Radon 11% Medicine 14% Buildings & soil 0.4% Fallout 12% Food & water 0.4% Miscellaneous 10% Cosmic 0.2% Occupational 4% Thoron 0.04% Nuclear discharges ",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -616,9 +660,9 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "6c9fe7851d0f4e06c5ec939f53dbce3b",
- "text": "r a e y",
+ "type": "Title",
+ "element_id": "a8706e82b3f90cffc996a24348e3b670",
+ "text": "r",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -639,8 +683,8 @@
},
{
"type": "Title",
- "element_id": "a70b649d3f49fafd8a15a6617364bd69",
- "text": "W T",
+ "element_id": "da631c23500655c51b9311a61f55744f",
+ "text": "a",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -660,9 +704,383 @@
}
},
{
- "type": "NarrativeText",
- "element_id": "26963be98ae7ff2e8c0428862d074cf6",
- "text": "r e p s e i t i l",
+ "type": "Title",
+ "element_id": "d78a11e9e55235934c3a4922053c68e5",
+ "text": "e",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "8d14df8b7fd7744365fbf8e02d69415a",
+ "text": "y",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "f4df01bee1b8ffb973ac8539649c5189",
+ "text": "W",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "b733cf49de269e22bed7c9883b958669",
+ "text": "T",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "c4b47d788b26c3d5c62ad462ed3ca2db",
+ "text": "r",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "bff4435574259239761670b31432cc8a",
+ "text": "e",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "8ba15a3a71eb0bb689c582098cce6730",
+ "text": "p",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "5fde097ba00ad7647206ae11c721d28c",
+ "text": "s",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "UncategorizedText",
+ "element_id": "81331ee9da4145c2651d6483696fe966",
+ "text": "8",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "81f1f3b9da6df38d938bf7871fa069b5",
+ "text": "e",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "aa4a79651a9a0087b66fcc40a2213113",
+ "text": "i",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "6d1c0d05d3a424b43d9572188a76c2d4",
+ "text": "t",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "392a17b2f3eba46f4bcf078e0b204514",
+ "text": "i",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "d24a9a771e46fdd6b269f1ecaf0b5eec",
+ "text": "l",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "9dc4537afa8ae0b959a542f9ba5c1e03",
+ "text": "S",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "919dac2487a4c860747318a132a54a72",
+ "text": "a",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "04ee5d05c3fcfffd945762e803478600",
+ "text": "t",
+ "metadata": {
+ "filetype": "application/pdf",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 5,
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
+ "version": "e690f37ef36368a509d150f373a0bbe0",
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
+ },
+ "date_created": "1676196572.0",
+ "date_modified": "1676196572.0"
+ }
+ }
+ },
+ {
+ "type": "Title",
+ "element_id": "63dabde368e2cf310d20a885fe50314a",
+ "text": "a",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -683,8 +1101,8 @@
},
{
"type": "Title",
- "element_id": "b69c60ea2e3fa24e25a069f90ee4b696",
- "text": "a t a F",
+ "element_id": "796538927664e4d87312c428469428f5",
+ "text": "F",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -705,7 +1123,7 @@
},
{
"type": "FigureCaption",
- "element_id": "af241d12aef0f51bace400db4e14649d",
+ "element_id": "d1496d2dc28f6877646e280c0c47e9ab",
"text": "Figure 3. Comparison of number of fatalities due to electricity generation, including accidents and air pollution3",
"metadata": {
"filetype": "application/pdf",
@@ -727,7 +1145,7 @@
},
{
"type": "NarrativeText",
- "element_id": "1e41836d6d7be638be9c0de0ce2c2256",
+ "element_id": "76619db169f10599a1fb73a13fdebafb",
"text": "Contrary to perceptions, nuclear is an incredibly safe source of energy (see Figure 3 for a comparison). What is also clear is that the continued use of alternative energy sources in preference to nuclear energy \u2013 in particular fossil fuels \u2013 poses a far greater risk to public health by significantly contributing to climate change and air pollution.",
"metadata": {
"filetype": "application/pdf",
@@ -749,7 +1167,7 @@
},
{
"type": "ListItem",
- "element_id": "59b04b23d82c7c013abd6477a14c9425",
+ "element_id": "ffdd87176fddbe78d853186bf86602ea",
"text": "2 Including 28 firefighters that were exposed to lethal amounts of radiation during the accident night, and 15 fatal cases of thyroid cancer. 3 Sources drawn upon: Markandya, A., & Wilkinson, P. (2007), Sovacool et al. (2016). Data for nuclear accidents modified to reflect the 2012 UNSCEAR report and the 2015 US NRC SOARCA study.",
"metadata": {
"filetype": "application/pdf",
@@ -771,7 +1189,7 @@
},
{
"type": "Header",
- "element_id": "9a8b3b64f6d252a6d31d87d306952ca2",
+ "element_id": "8d78bc83fa3b857721631a0491d6039b",
"text": "3",
"metadata": {
"filetype": "application/pdf",
@@ -1057,8 +1475,8 @@
},
{
"type": "Image",
- "element_id": "dc83c2d2395c30a8785a2533424f1c72",
- "text": "Plant-level production costs at market prices Grid-level costs of the electricity system Social and environmental costs of emissions, land-use, climate change, security of supply, etc. ",
+ "element_id": "96aca413098163140d5213641ae01231",
+ "text": "Plant-level Social and production costs Grid-level costs environmental costs of at market prices of the electricity emissions, land-use, system climate change, security of supply, etc. ",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1519,8 +1937,8 @@
},
{
"type": "ListItem",
- "element_id": "a3be0ee530629ee7a2413f05eb0cce76",
- "text": "BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712",
+ "element_id": "2ab37467d413d491735b002a679afdb8",
+ "text": "ii BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1585,8 +2003,8 @@
},
{
"type": "ListItem",
- "element_id": "4ca71e69090af4ad16216a0ddcc0a168",
- "text": "International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018",
+ "element_id": "f8c502221064df965c932a0b76e0717b",
+ "text": "v International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018",
"metadata": {
"filetype": "application/pdf",
"languages": [
@@ -1607,8 +2025,8 @@
},
{
"type": "ListItem",
- "element_id": "16bb7fca4ab44ffc73e847ea7b93fc4d",
- "text": "Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8",
+ "element_id": "deb6d3d9e6eae5a2256fbc12db133555",
+ "text": "vi Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8",
"metadata": {
"filetype": "application/pdf",
"languages": [
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 6e96b496a9..782ceee513 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.12-dev1" # pragma: no cover
+__version__ = "0.15.12-dev2" # pragma: no cover
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
index 618379edac..a99e5855d0 100644
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -7,6 +7,7 @@
from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
from unstructured.partition.pdf_image.pdfminer_utils import (
extract_image_objects,
+ extract_text_objects,
open_pdfminer_pages_generator,
rect_to_bbox,
)
@@ -57,15 +58,23 @@ def process_data_with_pdfminer(
layout: list["TextRegion"] = []
for obj in page_layout:
- x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
-
if hasattr(obj, "get_text"):
- _text = obj.get_text()
- text_region = _create_text_region(
- x1, y1, x2, y2, coef, _text, Source.PDFMINER, EmbeddedTextRegion
- )
- if text_region.bbox is not None and text_region.bbox.area > 0:
- layout.append(text_region)
+ inner_text_objects = extract_text_objects(obj)
+ for inner_obj in inner_text_objects:
+ _text = inner_obj.get_text()
+ new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(inner_obj.bbox, height)
+ text_region = _create_text_region(
+ new_x1,
+ new_y1,
+ new_x2,
+ new_y2,
+ coef,
+ _text,
+ Source.PDFMINER,
+ EmbeddedTextRegion,
+ )
+ if text_region.bbox is not None and text_region.bbox.area > 0:
+ layout.append(text_region)
else:
inner_image_objects = extract_image_objects(obj)
for img_obj in inner_image_objects:
@@ -76,6 +85,8 @@ def process_data_with_pdfminer(
if text_region.bbox is not None and text_region.bbox.area > 0:
layout.append(text_region)
+ layout = remove_duplicate_embedded_text(layout)
+
# NOTE(christine): always do the basic sort first for deterministic order across
# python versions.
layout = sort_text_regions(layout, SORT_MODE_BASIC)
@@ -292,6 +303,38 @@ def clean_pdfminer_duplicate_image_elements(document: "DocumentLayout") -> "Docu
return document
+@requires_dependencies("unstructured_inference")
+def remove_duplicate_embedded_text(elements: list["TextRegion"]) -> list["TextRegion"]:
+    """Removes duplicate text elements extracted by PDFMiner from a document layout.
+
+    Only `EmbeddedTextRegion` instances take part in the comparison; every other element is
+    passed through untouched. An embedded region is dropped when `boxes_self_iou` flags it as
+    matching at least one embedded region that appears *later* in `elements`, so the last
+    occurrence of a duplicated region is the one that is kept.
+
+    Args:
+        elements: regions of a single layout, in their current order.
+
+    Returns:
+        A new list holding the surviving elements in their original relative order. The input
+        list is not modified.
+    """
+    from unstructured_inference.inference.elements import EmbeddedTextRegion
+
+    # Positions in `elements` of the regions compared against each other; row/column `k` of
+    # the matrix computed below corresponds to `elements[embedded_indices[k]]`.
+    embedded_indices = [
+        i for i, element in enumerate(elements) if isinstance(element, EmbeddedTextRegion)
+    ]
+
+    # Fewer than two embedded regions cannot form a duplicate pair, so skip the matrix
+    # computation (this also avoids handing an empty list to the IOU helper).
+    if len(embedded_indices) < 2:
+        return list(elements)
+
+    # NOTE(review): presumably a square truth matrix of "IOU above threshold" per pair --
+    # confirm against `boxes_self_iou`.
+    iou = boxes_self_iou(
+        [elements[i].bbox for i in embedded_indices],
+        env_config.EMBEDDED_TEXT_SAME_REGION_THRESHOLD,
+    )
+
+    # Only look to the right of the diagonal: this ignores the self-match and makes sure that
+    # exactly one member of a matching pair (the later one) survives.
+    duplicated = {
+        element_idx
+        for row, element_idx in enumerate(embedded_indices)
+        if iou[row, row + 1 :].any()
+    }
+
+    return [element for i, element in enumerate(elements) if i not in duplicated]
+
+
def aggregate_embedded_text_by_block(
text_region: "TextRegion",
pdf_objects: list["TextRegion"],
diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
index c5cf7f8326..23332745e6 100644
--- a/unstructured/partition/pdf_image/pdfminer_utils.py
+++ b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -3,7 +3,7 @@
from typing import BinaryIO, List, Tuple
from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem
+from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PSSyntaxError
@@ -34,6 +34,19 @@ def extract_image_objects(parent_object: LTItem) -> List[LTImage]:
return objects
+def extract_text_objects(parent_object: LTItem) -> List[LTTextLine]:
+    """Collect every text line found at or below `parent_object`.
+
+    A `LTTextLine` is returned as-is (its children are not visited). Any other `LTContainer`
+    is walked recursively and the text lines of its children are concatenated in iteration
+    order. Objects that are neither yield an empty list.
+    """
+    # The text-line check must come first: it decides whether we stop or keep descending.
+    if isinstance(parent_object, LTTextLine):
+        return [parent_object]
+
+    if not isinstance(parent_object, LTContainer):
+        return []
+
+    return [
+        text_line
+        for child in parent_object
+        for text_line in extract_text_objects(child)
+    ]
+
+
def rect_to_bbox(
rect: Tuple[float, float, float, float],
height: float,
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
index 89c60cc78f..965f060b5c 100644
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -150,6 +150,11 @@ def EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD(self) -> float:
"""
return self._get_float("EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD", 0.99)
+    @property
+    def EMBEDDED_TEXT_SAME_REGION_THRESHOLD(self) -> float:
+        """threshold to consider the bounding boxes of two embedded text regions as the same
+        region
+
+        Looked up under its own key, `EMBEDDED_TEXT_SAME_REGION_THRESHOLD` (default 0.9), so
+        it can be tuned without affecting any other same-region threshold.
+        """
+        return self._get_float("EMBEDDED_TEXT_SAME_REGION_THRESHOLD", 0.9)
+
@property
def PDF_ANNOTATION_THRESHOLD(self) -> float:
"""The threshold value (between 0.0 and 1.0) that determines the minimum overlap required