Skip to content

Commit

Permalink
extractor: fix alignment of nested tables
Browse files Browse the repository at this point in the history
This commit fixes issues that occur when camelot parses a table whose nested
tables aren't perfectly aligned. In that case camelot produces a table like this:

x x - - - -
- - x - x -
x x - - - -
- - - x - x

The code transforms the above table to look like this instead:

x x - -
- - x x
x x - -
- - x x

This makes sure that no data is lost when parsing.

An example of a table in the spec that previously caused these issues is
Figure 259 from Base Specification 2.0b

Signed-off-by: Karl Bonde Torp <[email protected]>
  • Loading branch information
karlowich committed Nov 10, 2022
1 parent 28522a7 commit 20b7b13
Show file tree
Hide file tree
Showing 21 changed files with 469 additions and 2 deletions.
158 changes: 157 additions & 1 deletion nvme_lint/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

from . import utils
import camelot
import numpy as np
import pandas as pd


def extract_tables(file_path, page_height, page_number, content):
Expand All @@ -16,7 +18,12 @@ def extract_tables(file_path, page_height, page_number, content):
tables = {}
for caption, table in match_caption_to_table(tables_on_page, page_height, content):
if caption != "skip":
tables.update({caption: table})
table = clean_table(table.df)

# discard tables with less than two rows
# a table with only headings is irrelevant
if len(table.index) > 1:
tables.update({caption: table})
return page_number, tables


Expand All @@ -33,6 +40,155 @@ def calc_difference(caption_y, table_y, content_height):
return abs((1 - caption_y / content_height) - table_y)


def clean_table(table):
    """Normalise a parsed table: drop empty rows/columns and align nested subtables.

    The given DataFrame is modified in place by the clean-up steps; the
    returned DataFrame may be a different object when columns were merged
    while fixing holes. Empty cells are empty strings in the result and the
    column labels are renumbered 0..n-1.
    """
    # Blank (empty or whitespace-only) cells become NaN so the NaN-aware
    # pandas helpers can be used for the rest of the clean-up
    table.replace(to_replace=r"^\s*$", value=np.nan, regex=True, inplace=True)

    # Remove rows without any content and renumber the remaining rows
    table.dropna(axis=0, how="all", inplace=True)
    table.reset_index(drop=True, inplace=True)

    # Remove columns without any content
    table.dropna(axis=1, how="all", inplace=True)

    subtables = partition_into_subtables(0, table)

    while True:
        holes = find_holes(subtables, table)
        if not holes:
            break
        # Holes in the outermost (highest-index) columns are fixed first
        outermost = max(holes)
        table = fix_hole(outermost, holes[outermost], table)

    # Back to empty strings, which the later parsing and transformations expect
    table.replace(np.nan, "", inplace=True)

    # Renumber the columns so that the labels have no gaps
    table.columns = range(table.shape[1])
    return table


def find_holes(subtables, table):
    """Look for a hole in the first row of every subtable.

    Return a dictionary mapping the column position of each hole found to
    the subtable (set of row indices) whose first row contains it.
    """
    holes = {}
    for rows in subtables:
        # Only the first (lowest-index) row of a subtable is inspected
        first_row = table.iloc[min(rows)]
        column = find_hole_in_row(first_row)
        if column:
            holes[column] = rows

    return holes


def find_hole_in_row(row):
    """Find a hole in a row.

    A hole is an empty (null) cell located between two non-empty cells.

    Args:
        row: Iterable of cell values (e.g. a pandas Series); a cell counts as
            empty when ``pd.isnull`` is true for it.

    Returns:
        The position of the hole, or None if the row has no hole. If several
        cells between the first and last value are empty, the largest
        position is returned. An empty or completely empty row has no hole.
    """
    # Positions are collected in ascending order, so the first/last entries
    # are the smallest/largest filled positions
    filled = [i for i, value in enumerate(row) if not pd.isnull(value)]
    if not filled:
        # Nothing in the row at all: there is nothing a hole could be between
        return None

    gaps = set(range(filled[0], filled[-1])) - set(filled)
    return max(gaps, default=None)


def fix_hole(col, rows_with_hole, table):
    """Fix the hole at column position ``col``.

    Three strategies are tried in order:

    1. If no row has a value in both ``col`` and ``col + 1``, the two
       columns are merged into one.
    2. Otherwise, if no row has a value in both ``col - 1`` and ``col``,
       those two columns are merged into one.
    3. Otherwise the cells of ``col`` and ``col + 1`` are swapped, but only
       in the rows listed in ``rows_with_hole``.

    Args:
        col: Position of the column that contains the hole.
        rows_with_hole: Positional indices of the rows of the subtable whose
            first row contains the hole (typically a set).
        table: DataFrame in which empty cells are null.

    Returns:
        A new DataFrame with one column fewer when two columns were merged;
        otherwise the given table, modified in place.
    """
    def no_conflict(first, second):
        """Return True if no row has a value in both columns."""
        return all(pd.isnull(row[first]) or pd.isnull(row[second])
                   for row in table.itertuples(index=False, name=None))

    columns = table.columns

    if no_conflict(col, col + 1):
        # Merge entire col and col+1 into a single column
        merged = table[columns[col]].combine_first(table[columns[col + 1]])
        return pd.concat([table[columns[:col]], merged, table[columns[col + 2:]]], axis=1)

    if no_conflict(col - 1, col):
        # Merge entire col-1 and col into a single column
        merged = table[columns[col - 1]].combine_first(table[columns[col]])
        return pd.concat([table[columns[:col - 1]], merged, table[columns[col + 1:]]], axis=1)

    # It is not possible to merge an entire column:
    # swap col and col+1 only for rows_with_hole.
    # Each row is swapped on its own, so the outcome does not depend on the
    # iteration order of rows_with_hole (sets have no guaranteed order).
    for row in sorted(rows_with_hole):
        table.iat[row, col], table.iat[row, col + 1] = (
            table.iat[row, col + 1],
            table.iat[row, col],
        )

    return table


def partition_into_subtables(current_first_row, table):
    """Recursively split the table into subtables, starting at the given row.

    Return a list of sets; each set holds the row indices belonging to one
    subtable. Nested subtables come before the subtable that encloses them.
    """
    column_count = table.shape[1]

    # Rightmost column with content in the first row (0 if there is none)
    current_last_col = next(
        (col for col in reversed(range(column_count))
         if not pd.isnull(table.iat[current_first_row, col])),
        0,
    )

    current_last_row = find_outer_end(current_first_row, table)
    own_rows = set(range(current_first_row, current_last_row + 1))

    subtables = []
    cursor = current_first_row
    nested_start = find_next_subtable(cursor, current_last_col, table)
    while nested_start:
        nested = partition_into_subtables(nested_start, table)

        # Rows claimed by nested subtables no longer belong to this subtable
        own_rows.difference_update(*nested)
        subtables.extend(nested)

        # Continue the search after the rows covered by the nested subtables
        cursor = nested_start + sum(len(rows) for rows in nested)
        nested_start = find_next_subtable(cursor, current_last_col, table)

    # Don't include empty subtables
    if own_rows:
        subtables.append(own_rows)

    return subtables


def find_outer_end(first_row, table):
    """Return the index of the last row of the outer shape of a subtable.

    The subtable starts at ``first_row``. Its end is the row just before the
    first row that has content to the LEFT of the subtable's first column.
    """
    row_count, column_count = table.shape

    # Leftmost column with content in the first row (0 if there is none)
    leftmost = next(
        (col for col in range(column_count) if not pd.isnull(table.iat[first_row, col])),
        0,
    )

    for candidate in range(first_row, row_count):
        cells_to_the_left = (table.iat[candidate, col] for col in range(leftmost))
        if not all(pd.isnull(cell) for cell in cells_to_the_left):
            return candidate - 1

    # No row has content to the left: the subtable runs to the end of the table
    return row_count - 1


def find_next_subtable(first_row, last_col, table):
    """Return the index of the first row of the next subtable.

    That is the first row after ``first_row`` that has content to the RIGHT
    of ``last_col``. Return None when there is no such row.
    """
    row_count, column_count = table.shape
    columns_to_the_right = range(last_col + 1, column_count)

    for candidate in range(first_row + 1, row_count):
        for col in columns_to_the_right:
            if not pd.isnull(table.iat[candidate, col]):
                return candidate

    return None


def main(file_path, page_height, page_number, content):
"""Entry point"""
global logger
Expand Down
1 change: 0 additions & 1 deletion nvme_lint/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ def parse_page(page_number, tables):

def parse_table(caption, table):
"""Parse headings, remove notes and headings from table"""
table = table.df
headings = parse_headings(table.head(1).to_numpy()[0])
# Check if the first word in the first row from the bottom is NOTES
first_word_of_last_row = table.tail(1).to_numpy()[0][0].split(":")[0]
Expand Down
17 changes: 17 additions & 0 deletions tests/data/aligned.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
0,1,2,3
"Bits","Description","",""
"14","Reserved","",""
"13:12","Contents (CNTTS): This field in combination with the Scope field specifies the contents of the Command and Feature Identifier List field in the log page.","",""
"","","Value","Command and Feature Identifier List Definition"
"","","00b","List of command opcodes or Feature Identifiers based on the Scope field that are supported to be prohibited."
"","","01b","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received on an NVM Express controller Admin submission queue."
"","","10b","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received out-of-band on a Management Endpoint."
"","","11b","Reserved"
"11:08","Scope (SCP): This field in combination with the Contents field specifies the contents of the Command and Feature Identifier List field in the log page.","",""
"","","Value","Command and Feature Identifier List Contents"
"","","0h","List of Admin Command Set opcodes"
"","","1h","Reserved"
"","","2h","List of Feature Identifiers"
"","","3h","List of a Management Interface Command Set opcodes (refer to the NVM Express Management Interface Specification)"
"","","4h","List of a PCIe Command Set opcodes (refer to the NVM Express Management Interface Specification)"
"","","5h to Fh","Reserved"
10 changes: 10 additions & 0 deletions tests/data/aligned_3_col.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
0,1,2,3,4
"A","1","","",""
"B","2","","",""
"","","C","3","D"
"","","E","4","F"
"","","G","5","H"
"I","6","","",""
"","","J","7","K"
"","","L","8","M"
"","","N","9","O"
10 changes: 10 additions & 0 deletions tests/data/aligned_3_col_with_missing_columns.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
0,1,2,3,4
"A","1","","",""
"B","2","","",""
"","","C","3","D"
"","","E","","F"
"","","G","","H"
"I","6","","",""
"","","J","7","K"
"","","L","8",""
"","","M","9",""
10 changes: 10 additions & 0 deletions tests/data/aligned_3_col_with_missing_values.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
0,1,2,3,4
"A","1","","",""
"B","2","","",""
"","","C","3","D"
"","","E","","F"
"","","G","5","H"
"I","6","","",""
"","","J","7","K"
"","","L","8",""
"","","M","9","N"
23 changes: 23 additions & 0 deletions tests/data/aligned_double_nested.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
0,1,2,3,4,5
"Bytes","Description","","","",""
"0","","","","",""
"","","Bits","Description","",""
"","","7:6","Reserved","",""
"","","5:4","Contents Selected (CS): This field in combination with the Scope Selected field indicates the contents of the Command and Feature Identifier List field in the log page. The Content Selected field is specified by the contents of the Contents field in the Log Specific Field field of the Get Log Page command.","",""
"","","","","Value","Description"
"","","","","00b","List contains command opcodes or Set Features Feature Identifiers based on the Scope Selected field that are supported to be prohibited"
"","","","","01b","List contains command opcodes or Set Features Feature Identifiers based on the Scope Selected field that are currently prohibited if received on an NVM Express controller submission queue"
"","","","","10b","List contains command opcodes or Set Features Feature Identifiers based on the Scope field that are currently prohibited if received out-of-band on a Management Endpoint"
"","","","","11b","Reserved"
"","","3:0","Scope Selected (SS): This field in combination with the Contents Selected field indicates what the Command and Feature Identifier List field contains in the log page. The Scope Selected field is specified by the contents of the Scope field in the Log Specific field of the Get Log Page command.","",""
"","","","","Value","Description"
"","","","","0h","List contains Admin Command Set opcodes"
"","","","","1h","Reserved"
"","","","","2h","List contains Set Features Feature Identifiers"
"","","","","3h","List contains Management Interface Command Set opcodes"
"","","","","4h","List contains PCIe Command Set opcodes"
"","","","","5h to Fh","Reserved"
"2:1","Reserved","","","",""
"3","Length (LNGTH): This field indicates the length in bytes (n) of the Command and Feature Identifier List field that follow in the log page. If the Command and Feature Identifier List field contains no coded values, then this field shall be cleared to 0h.","","","",""
"n+3:4","Command and Feature Identifier List (CFIL): The contents of this field are dependent on the setting of the Contents Selected field and Scope Selected field. This field contains a list of coded values identified by the Scope Selected field and the Content Selected field. The list shall be in order from lowest numerical value to highest numerical value.","","","",""
"511:n+4","Reserved","","","",""
10 changes: 10 additions & 0 deletions tests/data/aligned_mixed_col.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
0,1,2,3,4
"A","1","","",""
"B","2","","",""
"","","C","D",""
"","","E","F",""
"","","G","H",""
"I","6","","",""
"","","J","7","K"
"","","L","8","M"
"","","N","9","O"
10 changes: 10 additions & 0 deletions tests/data/aligned_no_nesting.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
0,1
"A","1"
"B","2"
"C","3"
"D","4"
"E","5"
"F","6"
"G","7"
"H","8"
"I","9"
17 changes: 17 additions & 0 deletions tests/data/aligned_with_missing_values.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
0,1,2,3
"Bits","Description","",""
"14","Reserved","",""
"13:12","Contents (CNTTS): This field in combination with the Scope field specifies the contents of the Command and Feature Identifier List field in the log page.","",""
"","","Value","Command and Feature Identifier List Definition"
"","","00b","List of command opcodes or Feature Identifiers based on the Scope field that are supported to be prohibited."
"","","","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received on an NVM Express controller Admin submission queue."
"","","10b","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received out-of-band on a Management Endpoint."
"","","11b","Reserved"
"11:08","Scope (SCP): This field in combination with the Contents field specifies the contents of the Command and Feature Identifier List field in the log page.","",""
"","","Value","Command and Feature Identifier List Contents"
"","","0h","List of Admin Command Set opcodes"
"","","1h","Reserved"
"","","","List of Feature Identifiers"
"","","3h","List of a Management Interface Command Set opcodes (refer to the NVM Express Management Interface Specification)"
"","","4h","List of a PCIe Command Set opcodes (refer to the NVM Express Management Interface Specification)"
"","","5h to Fh","Reserved"
17 changes: 17 additions & 0 deletions tests/data/misaligned_1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
0,1,2,3,4,5,6,7
"Bits","Description","","","","","",""
"14","Reserved","","","","","",""
"13:12","Contents (CNTTS): This field in combination with the Scope field specifies the contents of the Command and Feature Identifier List field in the log page.","","","","","",""
"","","","Value","","Command and Feature Identifier List Definition","",""
"","","","00b","","List of command opcodes or Feature Identifiers based on the Scope field that are supported to be prohibited.","",""
"","","","01b","","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received on an NVM Express controller Admin submission queue.","",""
"","","","10b","","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received out-of-band on a Management Endpoint.","",""
"","","","11b","","Reserved","",""
"11:08","Scope (SCP): This field in combination with the Contents field specifies the contents of the Command and Feature Identifier List field in the log page.","","","","","",""
"","","Value","","Command and Feature Identifier List Contents","","",""
"","","0h","","List of Admin Command Set opcodes","","",""
"","","1h","","Reserved","","",""
"","","2h","","List of Feature Identifiers","","",""
"","","3h","","List of a Management Interface Command Set opcodes (refer to the NVM Express Management Interface Specification)","","",""
"","","4h","","List of a PCIe Command Set opcodes (refer to the NVM Express Management Interface Specification)","","",""
"","","5h to Fh","","Reserved","","",""
17 changes: 17 additions & 0 deletions tests/data/misaligned_2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
0,1,2,3,4,5,6,7
"Bits","Description","","","","","",""
"14","Reserved","","","","","",""
"13:12","Contents (CNTTS): This field in combination with the Scope field specifies the contents of the Command and Feature Identifier List field in the log page.","","","","","",""
"","","Value","","Command and Feature Identifier List Definition","","",""
"","","00b","","List of command opcodes or Feature Identifiers based on the Scope field that are supported to be prohibited.","","",""
"","","01b","","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received on an NVM Express controller Admin submission queue.","","",""
"","","10b","","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received out-of-band on a Management Endpoint.","","",""
"","","11b","","Reserved","","",""
"11:08","Scope (SCP): This field in combination with the Contents field specifies the contents of the Command and Feature Identifier List field in the log page.","","","","","",""
"","","","Value","","Command and Feature Identifier List Contents","",""
"","","","0h","","List of Admin Command Set opcodes","",""
"","","","1h","","Reserved","",""
"","","","2h","","List of Feature Identifiers","",""
"","","","3h","","List of a Management Interface Command Set opcodes (refer to the NVM Express Management Interface Specification)","",""
"","","","4h","","List of a PCIe Command Set opcodes (refer to the NVM Express Management Interface Specification)","",""
"","","","5h to Fh","","Reserved","",""
10 changes: 10 additions & 0 deletions tests/data/misaligned_3_col.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
0,1,2,3,4,5,6
"A","1","","","",""
"B","2","","","",""
"","","C","","3","D"
"","","E","","4","F"
"","","G","","5","H"
"I","6","","","",""
"","","J","7","","K"
"","","L","8","","M"
"","","N","9","","O"
10 changes: 10 additions & 0 deletions tests/data/misaligned_3_col_with_missing_columns.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
0,1,2,3,4,5,6
"A","1","","","",""
"B","2","","","",""
"","","C","","3","D"
"","","E","","","F"
"","","G","","","H"
"I","6","","","",""
"","","J","7","","K"
"","","L","8","",""
"","","M","9","",""
10 changes: 10 additions & 0 deletions tests/data/misaligned_3_col_with_missing_values.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
0,1,2,3,4,5,6
"A","1","","","",""
"B","2","","","",""
"","","C","","3","D"
"","","E","","","F"
"","","G","","5","H"
"I","6","","","",""
"","","J","7","","K"
"","","L","8","",""
"","","M","9","","N"
Loading

0 comments on commit 20b7b13

Please sign in to comment.