Skip to content

Commit

Permalink
Fixed duplicate results in table search
Browse files: browse the repository at this point in the history
  • Loading branch information
erikkastelec committed Aug 26, 2020
1 parent fcc9260 commit 1615719
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 15 deletions.
2 changes: 1 addition & 1 deletion PDFScraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-__version__ = "1.0.10"
+__version__ = "1.0.11"

import argparse
import logging
Expand Down
29 changes: 16 additions & 13 deletions PDFScraper/outputGenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,20 +300,23 @@ def generate_html(output_path: str, docs: Documents, search_word: str, search_mo
tempfile_path = tempfile_path + "/table"
table.df[0].str.strip('.!? \n\t')
# perform fuzzy search over all columns
+found = False
 for i in range(0, table.shape[1]):
-for x in process.extract(search_word, table.df[i].astype(str).values.tolist(),
-scorer=fuzz.partial_ratio):
-if x[1] > 80:
-table.to_html(tempfile_path, classes="responsive-table", index=False)
-with codecs.open(tempfile_path, 'r') as table_file:
-# replace \n in table to fix formatting
-tab = re.sub(r'\\n', '<br>', table_file.read())
-if not header_printed:
-with tag('h2'):
-text("Found in document with location: " + str(document.path))
-doc.asis(tab)
-os.remove(tempfile_path)
-break
+if not found:
+for x in process.extract(search_word, table.df[i].astype(str).values.tolist(),
+scorer=fuzz.partial_ratio):
+if x[1] > 80:
+table.to_html(tempfile_path, classes="responsive-table", index=False)
+with codecs.open(tempfile_path, 'r') as table_file:
+# replace \n in table to fix formatting
+tab = re.sub(r'\\n', '<br>', table_file.read())
+if not header_printed:
+with tag('h2'):
+text("Found in document with location: " + str(document.path))
+doc.asis(tab)
+os.remove(tempfile_path)
+found = True
+break

# write HTML to file
# check if output path is a directory
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
"yattag==1.14.0",
],
name="PDFScraper",
-version="1.0.10",
+version="1.0.11",
author="Erik Kastelec",
author_email="[email protected]",
description="PDF text and table search",
Expand Down

0 comments on commit 1615719

Please sign in to comment.