diff --git a/README.md b/README.md index 7d7ab423..cfe5b441 100644 --- a/README.md +++ b/README.md @@ -277,7 +277,7 @@ If you're using `pdfplumber` on a Debian-based system and encounter a `PolicyErr | Method | Description | |--------|-------------| -|`.find_tables(table_settings={})`|Returns a list of `Table` objects. The `Table` object provides access to the `.cells`, `.rows`, and `.bbox` properties, as well as the `.extract(x_tolerance=3, y_tolerance=3)` method.| +|`.find_tables(table_settings={})`|Returns a list of `Table` objects. The `Table` object provides access to the `.cells`, `.rows`, and `.bbox` properties, as well as the `.extract(x_tolerance=3, y_tolerance=3, strip_whitespaces=True)` method.| |`.extract_tables(table_settings={})`|Returns the text extracted from *all* tables found on the page, represented as a list of lists of lists, with the structure `table -> row -> cell`.| |`.extract_table(table_settings={})`|Returns the text extracted from the *largest* table on the page, represented as a list of lists, with the structure `row -> cell`. 
(If multiple tables have the same size — as measured by the number of cells — this method returns the table closest to the top of the page.)| |`.debug_tablefinder(table_settings={})`|Returns an instance of the `TableFinder` class, with access to the `.edges`, `.intersections`, `.cells`, and `.tables` properties.| @@ -314,6 +314,7 @@ By default, `extract_tables` uses the page's vertical and horizontal lines (or r "intersection_tolerance": 3, "intersection_x_tolerance": None, "intersection_y_tolerance": None, + "strip_whitespaces": True, } ``` @@ -331,6 +332,7 @@ By default, `extract_tables` uses the page's vertical and horizontal lines (or r |`"keep_blank_chars"`| When using the `text` strategy, consider `" "` chars to be *parts* of words and not word-separators.| |`"text_tolerance"`, `"text_x_tolerance"`, `"text_y_tolerance"`| When the `text` strategy searches for words, it will expect the individual letters in each word to be no more than `text_tolerance` pixels apart.| |`"intersection_tolerance"`, `"intersection_x_tolerance"`, `"intersection_y_tolerance"`| When combining edges into cells, orthogonal edges must be within `intersection_tolerance` pixels to be considered intersecting.| +|`"strip_whitespaces"`| When extracting table text, strip leading and trailing whitespace from each cell's text.| ### Table-extraction strategies diff --git a/pdfplumber/page.py b/pdfplumber/page.py index ac94610c..bf301048 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -224,6 +224,10 @@ def extract_tables(self, table_settings={}): for k in ["x_tolerance", "y_tolerance"] if "text_" + k in table_settings ) + if "strip_whitespaces" in table_settings: + extract_kwargs.update( + {"strip_whitespaces": table_settings["strip_whitespaces"]} + ) return [table.extract(**extract_kwargs) for table in tables] diff --git a/pdfplumber/table.py b/pdfplumber/table.py index 9101636f..c16e2aaf 100644 --- a/pdfplumber/table.py +++ b/pdfplumber/table.py @@ -358,6 +358,7 @@ def extract( self, 
x_tolerance=utils.DEFAULT_X_TOLERANCE, y_tolerance=utils.DEFAULT_Y_TOLERANCE, + strip_whitespaces=True ): chars = self.page.chars @@ -386,7 +387,9 @@ def char_in_bbox(char, bbox): if len(cell_chars): cell_text = utils.extract_text( cell_chars, x_tolerance=x_tolerance, y_tolerance=y_tolerance - ).strip() + ) + if strip_whitespaces: + cell_text = cell_text.strip() else: cell_text = "" arr.append(cell_text) @@ -413,6 +416,7 @@ def char_in_bbox(char, bbox): "intersection_tolerance": 3, "intersection_x_tolerance": None, "intersection_y_tolerance": None, + "strip_whitespaces": True, }