-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf-image-extractor.py
120 lines (92 loc) · 4.26 KB
/
pdf-image-extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import argparse
import fitz # PyMuPDF
from PIL import Image
import io
import hashlib
from tqdm import tqdm
def extract_images_from_pdf(pdf_path, output_dir, min_size=100):
"""
Extract all images from a PDF file and save them to the output directory.
Args:
pdf_path: Path to the PDF file
output_dir: Directory to save extracted images
min_size: Minimum pixel size for images to extract (filters out tiny images)
"""
# Create a unique subfolder for this PDF to avoid name conflicts
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
pdf_output_dir = os.path.join(output_dir, pdf_name)
os.makedirs(pdf_output_dir, exist_ok=True)
print(f"\nProcessing: {pdf_path}")
# Open the PDF
doc = fitz.open(pdf_path)
image_count = 0
extracted_images = set() # Track image hashes to avoid duplicates
# Create progress bar for pages
pbar = tqdm(total=len(doc), desc="Extracting images", unit="page")
# Iterate through each page
for page_num, page in enumerate(doc):
# Get all images on the page
image_list = page.get_images(full=True)
for img_index, img_info in enumerate(image_list):
xref = img_info[0] # Image reference number
# Extract the image data
base_image = doc.extract_image(xref)
image_bytes = base_image["image"]
# Generate hash to identify duplicate images
img_hash = hashlib.md5(image_bytes).hexdigest()
if img_hash in extracted_images:
continue
extracted_images.add(img_hash)
# Try to identify the image type
image_ext = base_image["ext"]
# Load image with PIL to verify it's valid and check dimensions
try:
img = Image.open(io.BytesIO(image_bytes))
width, height = img.size
# Skip small images (often icons, bullets, etc.)
if width < min_size or height < min_size:
continue
# Save the image
image_count += 1
img_filename = f"page{page_num+1}_img{img_index}_{width}x{height}.{image_ext}"
img_path = os.path.join(pdf_output_dir, img_filename)
with open(img_path, "wb") as img_file:
img_file.write(image_bytes)
pbar.set_postfix({"images": image_count})
except Exception as e:
print(f"\nError extracting image: {e}")
pbar.update(1)
pbar.close()
print(f"Extracted {image_count} unique images from {pdf_path}")
return image_count
def main():
# Set up command line arguments
parser = argparse.ArgumentParser(description='Extract images from all PDFs in a directory')
parser.add_argument('input_dir', nargs='?', help='Directory containing PDF files (default: ./pdfs)',
default='./pdfs')
parser.add_argument('--output_dir', help='Directory to save extracted images (default: ./extracted_images)',
default='./extracted_images')
parser.add_argument('--min_size', type=int, help='Minimum pixel dimension for images (default: 100)',
default=100)
args = parser.parse_args()
# Create output directory if it doesn't exist
os.makedirs(args.output_dir, exist_ok=True)
# Find PDF files recursively
pdf_files = []
for root, _, files in os.walk(args.input_dir):
for file in files:
if file.lower().endswith('.pdf'):
pdf_files.append(os.path.join(root, file))
if not pdf_files:
print(f"No PDF files found in {args.input_dir}")
return
print(f"Found {len(pdf_files)} PDF files")
# Process each PDF with progress bar
total_images = 0
for pdf_path in tqdm(pdf_files, desc="Processing PDFs", unit="pdf"):
image_count = extract_images_from_pdf(pdf_path, args.output_dir, args.min_size)
total_images += image_count
print(f"\nTotal images extracted: {total_images}")
if __name__ == "__main__":
main()