Skip to content

Commit

Permalink
Adding percent-of-readable-pages validation for PDFs (#4223)
Browse files Browse the repository at this point in the history
* Adding percent-of-readable-pages validation for PDFs

* Lint

* Adding unit test
  • Loading branch information
phildominguez-gsa committed Aug 30, 2024
1 parent dffbf33 commit a8d35a5
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 7 deletions.
Binary file not shown.
6 changes: 6 additions & 0 deletions backend/audit/test_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,9 +681,15 @@ def test_locked_pdf_file(self):
self.assertRaises(ValidationError, validate_pdf_file_integrity, file)

def test_scanned_pdf_file(self):
    """A PDF with too few parsable characters must be rejected as invalid."""
    fixture_path = "audit/fixtures/scanned.pdf"
    with open(fixture_path, "rb") as pdf:
        # The validator is expected to raise while reading the open handle.
        with self.assertRaises(ValidationError):
            validate_pdf_file_integrity(pdf)

def test_not_enough_readable_pages_pdf_file(self):
    """A PDF whose share of readable pages is too low must be rejected."""
    fixture_path = "audit/fixtures/not-enough-readable-pages.pdf"
    with open(fixture_path, "rb") as pdf:
        # The validator is expected to raise while reading the open handle.
        with self.assertRaises(ValidationError):
            validate_pdf_file_integrity(pdf)

def test_valid_pdf_file(self):
with open("audit/fixtures/basic.pdf", "rb") as file:
validate_pdf_file_integrity(file)
Expand Down
25 changes: 18 additions & 7 deletions backend/audit/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,7 @@ def validate_single_audit_report_file_extension(file):
def validate_pdf_file_integrity(file):
"""Files must be readable PDFs"""
MIN_CHARARACTERS_IN_PDF = 6000
MIN_PERCENT_READABLE_PAGES = 0.50

try:
reader = PdfReader(file)
Expand All @@ -672,17 +673,27 @@ def validate_pdf_file_integrity(file):
"We were unable to process the file you uploaded because it is encrypted."
)

text_length = 0
total_chars = 0
num_pages_with_text = 0

for page in reader.pages:
page_text = page.extract_text()
text_length += len(page_text)
# If we find enough characters, we're content.
if text_length >= MIN_CHARARACTERS_IN_PDF:
break
total_chars += len(page_text)
num_pages_with_text += 1 if len(page_text) else 0

percent_readable_pages = num_pages_with_text / len(reader.pages)

if text_length < MIN_CHARARACTERS_IN_PDF:
if total_chars == 0:
raise ValidationError(
"We were unable to process the file you uploaded because it contains no readable text."
)
elif total_chars < MIN_CHARARACTERS_IN_PDF:
raise ValidationError(
"We were unable to process the file you uploaded because it contains too little readable text."
)
elif percent_readable_pages < MIN_PERCENT_READABLE_PAGES:
raise ValidationError(
"We were unable to process the file you uploaded because it contains no readable text or too little text."
f"We were unable to process the file you uploaded because only {percent_readable_pages:.0%} of the pages contain readable text (minimum {MIN_PERCENT_READABLE_PAGES:.0%} required.)"
)

except ValidationError:
Expand Down

0 comments on commit a8d35a5

Please sign in to comment.