-
Notifications
You must be signed in to change notification settings - Fork 818
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: utility script to process large PDFs through the API by script (#…
…3591) Adds the bash script `process-pdf-parallel-through-api.sh` that allows splitting up a PDF into smaller parts (splits) to be processed through the API concurrently, and is re-entrant. If any of the splits fails to process, one can attempt reprocessing those split(s) by rerunning the script. Note: requires the `qpdf` command line utility. The command-line output below shows the scenario where just one split had to be reprocessed through the API to create the final `layout-parser-paper_combined.json` output. ``` $ BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res \ ./scripts/user/process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf > % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 Skipping processing for /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_1_to_6.json as it already exists. Skipping processing for /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_7_to_12.json as it already exists. Valid JSON output created: /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_13_to_16.json Processing complete. Combined JSON saved to /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_combined.json ``` Bonus change to `unstructured-get-json.sh` to point to the standard hosted Serverless API, but allow using the Free API with --freemium.
- Loading branch information
Showing
3 changed files
with
183 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Usage: ./process-pdf-parallel-through-api.sh filename.pdf | ||
|
||
set -eu -o pipefail | ||
|
||
# Exactly one positional argument (the PDF to process) is required; anything
# else prints the help text, including the optional env vars, and exits 1.
[ $# -eq 1 ] || {
  printf '%s\n' \
    "Processes a single PDF through the Unstructured API by breaking it into smaller splits that are processed concurrently." \
    "" \
    "Usage: $0 <pdf_filename>" \
    "Please provide a PDF filename as the first argument." \
    "" \
    "Optionally, set the following env vars: " \
    "" \
    "* STRATEGY (default hi_res)" \
    "* BATCH_SIZE (default 30) as the number of parts (AKA splits) to process in parallel" \
    "* PDF_SPLIT_PAGE_SIZE (default 10) as the number of pages per split" \
    "" \
    "BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res ./process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf"
  exit 1
}
|
||
ALLOWED_STRATEGIES=("hi_res" "fast" "auto")

# Validate the STRATEGY environment variable if it's set.
# The value is compared for exact equality against each allowed strategy.
# (An unanchored regex match against the joined list would also accept
# substrings such as "hi" or "res", and regex metacharacters such as ".*".)
if [ -n "${STRATEGY:-}" ]; then
  strategy_is_allowed=0
  for allowed_strategy in "${ALLOWED_STRATEGIES[@]}"; do
    if [ "$STRATEGY" = "$allowed_strategy" ]; then
      strategy_is_allowed=1
      break
    fi
  done
  if [ "$strategy_is_allowed" -ne 1 ]; then
    echo "Error: STRATEGY must be one of ${ALLOWED_STRATEGIES[*]}" >&2
    exit 1
  fi
fi

# Check that UNST_API_KEY is set and non-empty.
# The ":-" default is required because the script runs under `set -u`:
# without it an unset variable aborts with "unbound variable" before the
# friendly error message below can be printed.
if [ -z "${UNST_API_KEY:-}" ]; then
  echo "Error: UNST_API_KEY is not set or is empty" >&2
  exit 1
fi
|
||
# Resolve the input PDF and derive the working directories.
#   PDF_DIR        - where split-pdf.sh writes the per-range PDF parts
#   PDF_OUTPUT_DIR - where the per-part and combined JSON outputs are written
# Both names embed the PDF's md5 and the split size, so a changed file or a
# different split size gets a fresh directory.
PDF_FILE="$1"
if [ ! -f "$PDF_FILE" ]; then
  echo "Error: PDF file not found: $PDF_FILE" >&2
  exit 1
fi

DEFAULT_SPLIT_SIZE=10
SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}
PDF_NAME=$(basename "$PDF_FILE" .pdf)
DEFAULT_DIR="$HOME/tmp/pdf-splits"
PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"
MD5_SUM=$(md5sum "$PDF_FILE" | awk '{ print $1 }')
PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"
# STRATEGY is optional; fall back to the same default the API request uses.
# A bare ${STRATEGY} here would abort under `set -u` when it is unset.
PDF_OUTPUT_DIR="$PDF_SPLITS_DIR/${PDF_NAME}-output-${MD5_SUM}_split-${SPLIT_SIZE}_strat-${STRATEGY:-hi_res}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Split the PDF only if the parts directory does not exist yet.
# The resolved split size and splits dir are passed explicitly: split-pdf.sh
# has its own default split size, so without this the parts could land in a
# directory other than the $PDF_DIR computed above.
if [ ! -d "$PDF_DIR" ]; then
  PDF_SPLIT_PAGE_SIZE="$SPLIT_SIZE" PDF_SPLITS_DIR="$PDF_SPLITS_DIR" \
    "$SCRIPT_DIR/split-pdf.sh" "$PDF_FILE"
fi

# Create output directory if it does not exist
mkdir -p "$PDF_OUTPUT_DIR"
|
||
incomplete=0 # Set to 1 (by process_batch) when any part fails to process

#######################################
# Send one PDF part to the Unstructured API and store the JSON response.
# An existing output file is treated as already done, which makes reruns
# skip parts that succeeded earlier.
# Globals:   UNST_API_KEY, STRATEGY (optional, default hi_res), PDF_FILE (read)
# Arguments: $1 - path of the PDF part to upload
#            $2 - page number of the part's first page in the original PDF
#            $3 - path of the JSON file to create
# Returns:   0 if the output already exists or valid JSON was written;
#            1 if the request failed or the response is not a JSON array of
#            objects (any output written is deleted so a rerun retries it).
#######################################
process_file_part() {
  local file="$1"
  local STARTING_PAGE_NUMBER="$2"
  local OUTPUT_JSON="$3"

  if [ -f "$OUTPUT_JSON" ]; then
    echo "Skipping processing for $OUTPUT_JSON as it already exists."
    return 0
  fi

  # A transport-level failure may leave a truncated file behind; remove it so
  # the next run does not mistake it for a finished part.
  if ! curl -q -X POST https://api.unstructuredapp.io/general/v0/general \
    -H "unstructured-api-key: $UNST_API_KEY" \
    -H 'accept: application/json' \
    -H 'Content-Type: multipart/form-data' \
    -F strategy="${STRATEGY:-hi_res}" \
    -F 'skip_infer_table_types="[]"' \
    -F starting_page_number="$STARTING_PAGE_NUMBER" \
    -F files=@"$file;filename=$PDF_FILE" \
    -o "$OUTPUT_JSON"; then
    echo "Request failed for $file, deleting any partial output." >&2
    rm -f -- "$OUTPUT_JSON"
    return 1
  fi

  # Verify JSON content: must be an array whose elements are objects.
  if ! jq -e 'if type=="array" then all(.[]; type=="object" or length==0) else empty end' "$OUTPUT_JSON" >/dev/null; then
    echo "Invalid JSON structure in $OUTPUT_JSON (contents below), deleting file."
    if [ -f "$OUTPUT_JSON" ]; then
      cat "$OUTPUT_JSON"
    fi
    rm -f -- "$OUTPUT_JSON"
    return 1
  fi

  echo "Valid JSON output created: $OUTPUT_JSON"
}

#######################################
# Process a batch of PDF parts concurrently and wait for all of them.
# Each part runs process_file_part in a background subshell. Variables set
# there never reach this shell, so failures are detected from each job's exit
# status and recorded here in the parent.
# Globals:   PDF_OUTPUT_DIR, PDF_NAME (read); incomplete (set to 1 on failure)
# Arguments: paths of PDF parts named *_pages_<start>_to_<end>.pdf
#######################################
process_batch() {
  local file START_PAGE END_PAGE OUTPUT_JSON pid
  local part_re='_pages_([0-9]+)_to_([0-9]+)\.pdf$'
  local -a pids=()

  for file in "$@"; do
    if ! [[ "$file" =~ $part_re ]]; then
      echo "Cannot determine page range from file name, skipping: $file" >&2
      incomplete=1
      continue
    fi
    START_PAGE="${BASH_REMATCH[1]}"
    END_PAGE="${BASH_REMATCH[2]}"
    OUTPUT_JSON="$PDF_OUTPUT_DIR/${PDF_NAME}_pages_${START_PAGE}_to_${END_PAGE}.json"
    process_file_part "$file" "$START_PAGE" "$OUTPUT_JSON" &
    pids+=("$!")
  done

  # Reap every job individually so each exit status is observed.
  # The ${pids[@]+...} form keeps an empty array safe under `set -u`.
  for pid in ${pids[@]+"${pids[@]}"}; do
    if ! wait "$pid"; then
      incomplete=1
    fi
  done
}
|
||
# Read PDF parts into an array
mapfile -t pdf_parts < <(find "$PDF_DIR" -name '*.pdf' -print)

# A failing `find` inside the process substitution does not abort the script,
# so check explicitly that there is something to process.
if [ "${#pdf_parts[@]}" -eq 0 ]; then
  echo "Error: no PDF parts found in $PDF_DIR" >&2
  exit 1
fi

# Process PDF parts in batches of 30, by default.
# BATCH_SIZE must be a positive integer: 0 would never advance the loop
# index and spin forever.
batch_size=${BATCH_SIZE:-30}
if ! [[ "$batch_size" =~ ^[0-9]+$ ]] || [ "$((10#$batch_size))" -lt 1 ]; then
  echo "Error: BATCH_SIZE must be a positive integer." >&2
  exit 1
fi
batch_size=$((10#$batch_size)) # force base 10 so "030" is not read as octal

for ((i = 0; i < ${#pdf_parts[@]}; i += batch_size)); do
  process_batch "${pdf_parts[@]:i:batch_size}"
done
|
||
# Determine the output filename based on whether processing was incomplete
if [ "$incomplete" -eq 1 ]; then
  combined_output_filename="${PDF_NAME}_incomplete_combined.json"
  echo "WARNING! not all json parts were successfully processed. you may rerun this script"
  echo "to attempt reprocessing those (failed to process) parts."
else
  combined_output_filename="${PDF_NAME}_combined.json"
fi

# Combine the per-part JSON outputs in numerical (version-sort) order.
# Only the *_pages_<start>_to_<end>.json part files are selected. Combined
# files left by earlier runs (and the file being written right now) live in
# the same directory and must be excluded, or their elements would be
# duplicated in the result.
find "$PDF_OUTPUT_DIR" -name '*_pages_*_to_*.json' ! -name '*_combined.json' -print0 |
  sort -zV |
  xargs -0 jq -s 'add' >"$PDF_OUTPUT_DIR/$combined_output_filename"

echo "Processing complete. Combined JSON saved to $PDF_OUTPUT_DIR/$combined_output_filename"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Usage: ./split-pdf.sh filename.pdf
#
# Splits a PDF into consecutive chunks of PDF_SPLIT_PAGE_SIZE pages (default 5)
# using qpdf. Parts are written to
#   $PDF_SPLITS_DIR/<name>-<md5>_split-<size>/<name>_pages_<start>_to_<end>.pdf
# where PDF_SPLITS_DIR defaults to $HOME/tmp/pdf-splits.

set -e

if [ $# -ne 1 ]; then
  echo "Usage: $0 <pdf_filename>" >&2
  exit 1
fi

PDF_FILE="$1"
DEFAULT_SPLIT_SIZE=5
SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}

# Validate that SPLIT_SIZE is a positive integer. Zero must be rejected:
# it would make END_PAGE = START_PAGE - 1 and the loop below never advance.
if ! [[ "$SPLIT_SIZE" =~ ^[0-9]+$ ]] || [ "$((10#$SPLIT_SIZE))" -lt 1 ]; then
  echo "Error: PDF_SPLIT_PAGE_SIZE must be a positive integer." >&2
  exit 1
fi

if [ ! -f "$PDF_FILE" ]; then
  echo "Error: PDF file not found: $PDF_FILE" >&2
  exit 1
fi

DEFAULT_DIR="$HOME/tmp/pdf-splits"
PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"
PDF_NAME=$(basename "$PDF_FILE" .pdf)
MD5_SUM=$(md5sum "$PDF_FILE" | awk '{ print $1 }')
PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"

# Create directory if it does not exist, remembering whether this run made it.
created_pdf_dir=0
if [ ! -d "$PDF_DIR" ]; then
  mkdir -p "$PDF_DIR"
  created_pdf_dir=1
fi

# If splitting fails part-way, remove the directory this run created so that
# callers keying off the directory's existence do not see a partial split.
cleanup_on_failure() {
  local status=$?
  if [ "$status" -ne 0 ] && [ "$created_pdf_dir" -eq 1 ]; then
    rm -rf -- "${PDF_DIR:?}"
  fi
}
trap cleanup_on_failure EXIT

# Total number of pages
TOTAL_PAGES=$(qpdf --show-npages "$PDF_FILE")

# Split PDF into $SPLIT_SIZE-page chunks; the last chunk may be shorter.
START_PAGE=1
while [ "$START_PAGE" -le "$TOTAL_PAGES" ]; do
  # 10# forces base 10 so a zero-padded size is not parsed as octal.
  END_PAGE=$((START_PAGE + 10#$SPLIT_SIZE - 1))
  if [ "$END_PAGE" -gt "$TOTAL_PAGES" ]; then
    END_PAGE=$TOTAL_PAGES
  fi
  OUTPUT_FILE="$PDF_DIR/${PDF_NAME}_pages_${START_PAGE}_to_${END_PAGE}.pdf"
  qpdf "$PDF_FILE" --pages . "$START_PAGE"-"$END_PAGE" -- "$OUTPUT_FILE"
  echo "Created $OUTPUT_FILE"
  START_PAGE=$((END_PAGE + 1))
done

echo "All parts have been saved to $PDF_DIR"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters