jsvine · jnhyperion · Aug 10, 2023 · Aug 17, 2023 · Aug 28, 2023
diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -455,6 +455,24 @@ def dedupe_chars(self, **kwargs: Any) -> "FilteredPage":
         p._objects["char"] = utils.dedupe_chars(self.chars, **kwargs)
         return p
 
+    def remove_whitespace(
+        self, only_overlapping: bool = False, **kwargs: Any
+    ) -> "FilteredPage":
+        """
+        Returns a filtered page that excludes objects whose text is " ".
+        With `only_overlapping=True`, the page's chars are instead replaced
+        by the result of `utils.remove_overlapped_whitespace`; `**kwargs`
+        are forwarded to that function and are otherwise unused.
+        """
+        if not only_overlapping:
+            return FilteredPage(self, lambda obj: obj.get("text") != " ")
+
+        filtered = FilteredPage(self, lambda obj: True)
+        filtered._objects = dict(self.objects.items())
+        chars = utils.remove_overlapped_whitespace(self.chars, **kwargs)
+        filtered._objects["char"] = chars
+        return filtered
+
     def to_image(
         self,
         resolution: Optional[Union[int, float]] = None,

diff --git a/pdfplumber/utils/__init__.py b/pdfplumber/utils/__init__.py
@@ -40,4 +40,5 @@
     extract_text,
     extract_text_simple,
     extract_words,
+    remove_overlapped_whitespace,
 )
diff --git a/pdfplumber/utils/text.py b/pdfplumber/utils/text.py
@@ -2,6 +2,7 @@
 import itertools
 import re
 import string
+from copy import deepcopy
 from operator import itemgetter
 from typing import Any, Dict, Generator, List, Match, Optional, Pattern, Tuple, Union
 
@@ -580,3 +581,38 @@ def yield_unique_chars(chars: T_obj_list) -> Generator[T_obj, None, None]:
 
     deduped = yield_unique_chars(chars)
     return sorted(deduped, key=chars.index)
+
+
+def remove_overlapped_whitespace(
+    chars: T_obj_list, y_tolerance: T_num = DEFAULT_Y_TOLERANCE
+) -> T_obj_list:
+    """
+    Return `chars`, in their original order, without the upright space chars
+    that a later non-space char on the same line starts to the left of.
+
+    Lines are formed by clustering the upright chars on `doctop` with
+    `y_tolerance`, and "later" refers to a char's position within its
+    cluster, not on the page. Only chars whose text is exactly " " count.
+
+    Non-upright chars are always kept, and neither the input list nor its
+    char dicts are modified.
+    """
+
+    def yield_overlapped_ids() -> Generator[int, None, None]:
+        # Overlap is only evaluated among upright chars, grouped into lines.
+        upright = [c for c in chars if c["upright"]]
+        for line in cluster_objects(upright, itemgetter("doctop"), y_tolerance):
+            # Walk each line backwards so `min_x0` is the leftmost x0 among the
+            # non-space chars that follow the current char.
+            min_x0: Optional[T_num] = None
+            for c in reversed(line):
+                if c["text"] != " ":
+                    if min_x0 is None or c["x0"] < min_x0:
+                        min_x0 = c["x0"]
+                elif min_x0 is not None and min_x0 < c["x0"]:
+                    yield id(c)
+
+    # Filter by identity, not equality, so that exactly the chars found above
+    # are dropped while everything else keeps its place in the input order.
+    overlapped = set(yield_overlapped_ids())
+    return [c for c in chars if id(c) not in overlapped]
diff --git a/tests/pdfs/issue-964-example.pdf b/tests/pdfs/issue-964-example.pdf
diff --git a/tests/test_issues.py b/tests/test_issues.py
@@ -257,3 +257,21 @@ def test_issue_683(self):
         with pdfplumber.open(path) as pdf:
             page = pdf.pages[0]
             page.search(r"\d+", regex=True)
+
+    def test_issue_964(self):
+        """
+        Overlapping blank chars split an extracted word into pieces.
+        """
+        path = os.path.join(HERE, "pdfs/issue-964-example.pdf")
+        with pdfplumber.open(path) as pdf:
+            page = pdf.pages[0]
+            # Unfiltered page: the word comes out broken up by spaces.
+            raw = page.extract_text().splitlines()
+            assert raw[5].startswith("VLHDU8SHRR H o m e o w ner Discount")
+            # Overlapping spaces removed: the gap between words is kept.
+            overlap_free = page.remove_whitespace(only_overlapping=True)
+            lines = overlap_free.extract_text().splitlines()
+            assert lines[5].startswith("VLHDU8SHRR Homeowner Discount")
+            # Every space removed: the gap between words is gone as well.
+            lines = page.remove_whitespace().extract_text().splitlines()
+            assert lines[5].startswith("VLHDU8SHRR HomeownerDiscount")