def clean_table(table):
    """Fix holes in nested tables, remove empty columns and align subtables.

    Blank cells (empty or whitespace-only strings) are treated as missing.
    Completely empty rows and columns are dropped, the rows are partitioned
    into (possibly nested) subtables, and "holes" (empty columns between the
    populated cells of a subtable's first row) are closed one at a time.

    The input DataFrame is left untouched; a new DataFrame is returned with
    missing cells as empty strings and columns renumbered 0..n-1.
    """
    # Work on a copy so the caller's DataFrame is never modified as a side effect
    table = table.copy()

    # Replace empty strings with NaN, to make the clean-up easier
    table.replace(r"^\s*$", np.nan, regex=True, inplace=True)

    # Drop empty rows and columns
    table.dropna(how="all", axis=0, inplace=True)
    # Row positions are used as row identifiers below, so the index must be gap-free
    table.reset_index(drop=True, inplace=True)
    table.dropna(how="all", axis=1, inplace=True)

    # Fixing holes only merges or swaps columns, so the row partition stays valid
    subtables = partition_into_subtables(0, table)

    while holes := find_holes(subtables, table):
        # Fix holes in the outermost cols first
        index = max(holes)
        table = fix_hole(index, holes[index], table)

    # Replace the NaN with empty strings, to make the parsing and transformations easier
    table.replace(np.nan, "", inplace=True)

    # Reset column index to remove gaps
    table.columns = range(table.shape[1])
    return table


def find_holes(subtables, table):
    """Find holes between columns of the first row of each subtable.

    Return dictionary with kv pairs of {column: subtable}, where column is the
    position of the hole and subtable is the set of row indices it belongs to.
    If two subtables have a hole in the same column, the later one wins.
    """
    holes = {}
    for subtable in subtables:
        # The first (lowest) row of a subtable is used as its heading row
        hole = find_hole_in_row(table.iloc[min(subtable)])
        if hole is not None:
            holes[hole] = subtable

    return holes


def find_hole_in_row(row):
    """Find hole in row.

    A hole is an empty cell located between the first and the last populated
    cell of the row.
    Return index of column with hole, or None if the row has no hole
    (this includes rows without any populated cell).
    """
    non_nan_indices = {i for i, value in enumerate(row) if not pd.isnull(value)}
    if not non_nan_indices:
        # Nothing populated, hence nothing to be "between"
        return None

    hole = set(range(min(non_nan_indices), max(non_nan_indices))) - non_nan_indices
    # If there are multiple empty cols between the values we return the largest index
    return max(hole) if hole else None


def fix_hole(col, rows_with_hole, table):
    """Fix hole in column 'col'.

    col is the position of the empty column, rows_with_hole the row indices of
    the subtable the hole was found in.
    If col never conflicts with a neighbouring column (no row has a value in
    both), the two columns are merged and a new DataFrame with one column less
    is returned. Otherwise the cells of col and col+1 are swapped in place for
    rows_with_hole only, and the same DataFrame is returned.
    """
    current = table.iloc[:, col]
    right = table.iloc[:, col + 1]

    # If there are no conflicts between col and col+1 for all rows
    if (current.isnull() | right.isnull()).all():
        # Merge entire col into col+1
        return pd.concat([table.iloc[:, :col],
                          current.combine_first(right),
                          table.iloc[:, col + 2:]], axis=1)

    left = table.iloc[:, col - 1]

    # If there are no conflicts between col-1 and col for all rows
    if (left.isnull() | current.isnull()).all():
        # Merge entire col into col-1
        return pd.concat([table.iloc[:, :col - 1],
                          left.combine_first(current),
                          table.iloc[:, col + 1:]], axis=1)

    # If it is not possible to merge entire column:
    # swap col and col+1 only for rows_with_hole. Each row is swapped on its
    # own, so the result does not depend on the iteration order of the set.
    for row in rows_with_hole:
        table.iat[row, col], table.iat[row, col + 1] = table.iat[row, col + 1], table.iat[row, col]

    return table


def partition_into_subtables(current_first_row, table):
    """Partition tables recursively into subtables.

    current_first_row is the first row of the subtable to partition; nested
    subtables are the rows that have content to the right of the last
    populated column of that first row.
    Return a list of sets containing the indices of the rows they represent.
    Nested subtables come before the subtable that contains them.
    """
    _, n_cols = table.shape
    subtables = []

    # Right-most populated column of the first row (0 if nothing is populated)
    current_last_col = 0
    for col in reversed(range(n_cols)):
        if not pd.isnull(table.iat[current_first_row, col]):
            current_last_col = col
            break

    current_last_row = find_outer_end(current_first_row, table)

    current_subtable = set(range(current_first_row, current_last_row + 1))

    while next_first_row := find_next_subtable(current_first_row, current_last_col, table):
        current_first_row = next_first_row

        for subtable in partition_into_subtables(next_first_row, table):
            # Rows of a nested subtable do not belong to the enclosing one
            current_subtable -= subtable
            subtables.append(subtable)
            # Continue the search after the rows that were just assigned
            current_first_row += len(subtable)

    if current_subtable:
        # Don't include empty subtables
        subtables.append(current_subtable)

    return subtables


def find_outer_end(first_row, table):
    """Return index of the final row of the outer shape of the subtable.

    This is done by finding the first row where there is content to the LEFT
    of the subtable; the row before it is the final row. If there is no such
    row, the subtable extends to the final row of the table.
    """
    n_rows, n_cols = table.shape

    # Left-most populated column of the first row (0 if nothing is populated)
    subtable_first_col = 0
    for col in range(n_cols):
        if not pd.isnull(table.iat[first_row, col]):
            subtable_first_col = col
            break

    for i in range(first_row, n_rows):
        if any(not pd.isnull(table.iat[i, x]) for x in range(subtable_first_col)):
            return i - 1

    # Subtable is in the final row of the table
    return n_rows - 1


def find_next_subtable(first_row, last_col, table):
    """Return index of the first row of the next subtable.

    This is done by finding the first row after first_row where there is
    content to the RIGHT of the current subtable (beyond last_col).
    If there is no such row, return None.
    """
    n_rows, n_cols = table.shape
    for i in range(first_row + 1, n_rows):
        if any(not pd.isnull(table.iat[i, x]) for x in range(last_col + 1, n_cols)):
            return i

    return None
+"","","11b","Reserved" +"11:08","Scope (SCP): This field in combination with the Contents field specifies the contents of the Command and Feature Identifier List field in the log page.","","" +"","","Value","Command and Feature Identifier List Contents" +"","","0h","List of Admin Command Set opcodes" +"","","1h","Reserved" +"","","2h","List of Feature Identifiers" +"","","3h","List of a Management Interface Command Set opcodes (refer to the NVM Express Management Interface Specification)" +"","","4h","List of a PCIe Command Set opcodes (refer to the NVM Express Management Interface Specification)" +"","","5h to Fh","Reserved" diff --git a/tests/data/aligned_3_col.csv b/tests/data/aligned_3_col.csv new file mode 100644 index 0000000..d7b7f61 --- /dev/null +++ b/tests/data/aligned_3_col.csv @@ -0,0 +1,10 @@ +0,1,2,3,4 +"A","1","","","" +"B","2","","","" +"","","C","3","D" +"","","E","4","F" +"","","G","5","H" +"I","6","","","" +"","","J","7","K" +"","","L","8","M" +"","","N","9","O" \ No newline at end of file diff --git a/tests/data/aligned_3_col_with_missing_columns.csv b/tests/data/aligned_3_col_with_missing_columns.csv new file mode 100644 index 0000000..7c00b96 --- /dev/null +++ b/tests/data/aligned_3_col_with_missing_columns.csv @@ -0,0 +1,10 @@ +0,1,2,3,4 +"A","1","","","" +"B","2","","","" +"","","C","3","D" +"","","E","","F" +"","","G","","H" +"I","6","","","" +"","","J","7","K" +"","","L","8","" +"","","M","9","" \ No newline at end of file diff --git a/tests/data/aligned_3_col_with_missing_values.csv b/tests/data/aligned_3_col_with_missing_values.csv new file mode 100644 index 0000000..7d4b496 --- /dev/null +++ b/tests/data/aligned_3_col_with_missing_values.csv @@ -0,0 +1,10 @@ +0,1,2,3,4 +"A","1","","","" +"B","2","","","" +"","","C","3","D" +"","","E","","F" +"","","G","5","H" +"I","6","","","" +"","","J","7","K" +"","","L","8","" +"","","M","9","N" \ No newline at end of file diff --git a/tests/data/aligned_double_nested.csv 
b/tests/data/aligned_double_nested.csv new file mode 100644 index 0000000..9e27887 --- /dev/null +++ b/tests/data/aligned_double_nested.csv @@ -0,0 +1,23 @@ +0,1,2,3,4,5 +"Bytes","Description","","","","" +"0","","","","","" +"","","Bits","Description","","" +"","","7:6","Reserved","","" +"","","5:4","Contents Selected (CS): This field in combination with the Scope Selected field indicates the contents of the Command and Feature Identifier List field in the log page. The Content Selected field is specified by the contents of the Contents field in the Log Specific Field field of the Get Log Page command.","","" +"","","","","Value","Description" +"","","","","00b","List contains command opcodes or Set Features Feature Identifiers based on the Scope Selected field that are supported to be prohibited" +"","","","","01b","List contains command opcodes or Set Features Feature Identifiers based on the Scope Selected field that are currently prohibited if received on an NVM Express controller submission queue" +"","","","","10b","List contains command opcodes or Set Features Feature Identifiers based on the Scope field that are currently prohibited if received out-of-band on a Management Endpoint" +"","","","","11b","Reserved" +"","","3:0","Scope Selected (SS): This field in combination with the Contents Selected field indicates what the Command and Feature Identifier List field contains in the log page. 
The Scope Selected field is specified by the contents of the Scope field in the Log Specific field of the Get Log Page command.","","" +"","","","","Value","Description" +"","","","","0h","List contains Admin Command Set opcodes" +"","","","","1h","Reserved" +"","","","","2h","List contains Set Features Feature Identifiers" +"","","","","3h","List contains Management Interface Command Set opcodes" +"","","","","4h","List contains PCIe Command Set opcodes" +"","","","","5h to Fh","Reserved" +"2:1","Reserved","","","","" +"3","Length (LNGTH): This field indicates the length in bytes (n) of the Command and Feature Identifier List field that follow in the log page. If the Command and Feature Identifier List field contains no coded values, then this field shall be cleared to 0h.","","","","" +"n+3:4","Command and Feature Identifier List (CFIL): The contents of this field are dependent on the setting of the Contents Selected field and Scope Selected field. This field contains a list of coded values identified by the Scope Selected field and the Content Selected field. 
The list shall be in order from lowest numerical value to highest numerical value.","","","","" +"511:n+4","Reserved","","","","" diff --git a/tests/data/aligned_mixed_col.csv b/tests/data/aligned_mixed_col.csv new file mode 100644 index 0000000..15ebd07 --- /dev/null +++ b/tests/data/aligned_mixed_col.csv @@ -0,0 +1,10 @@ +0,1,2,3,4 +"A","1","","","" +"B","2","","","" +"","","C","D","" +"","","E","F","" +"","","G","H","" +"I","6","","","" +"","","J","7","K" +"","","L","8","M" +"","","N","9","O" diff --git a/tests/data/aligned_no_nesting.csv b/tests/data/aligned_no_nesting.csv new file mode 100644 index 0000000..11880aa --- /dev/null +++ b/tests/data/aligned_no_nesting.csv @@ -0,0 +1,10 @@ +0,1 +"A","1" +"B","2" +"C","3" +"D","4" +"E","5" +"F","6" +"G","7" +"H","8" +"I","9" diff --git a/tests/data/aligned_with_missing_values.csv b/tests/data/aligned_with_missing_values.csv new file mode 100644 index 0000000..201620c --- /dev/null +++ b/tests/data/aligned_with_missing_values.csv @@ -0,0 +1,17 @@ +0,1,2,3 +"Bits","Description","","" +"14","Reserved","","" +"13:12","Contents (CNTTS): This field in combination with the Scope field specifies the contents of the Command and Feature Identifier List field in the log page.","","" +"","","Value","Command and Feature Identifier List Definition" +"","","00b","List of command opcodes or Feature Identifiers based on the Scope field that are supported to be prohibited." +"","","","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received on an NVM Express controller Admin submission queue." +"","","10b","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received out-of-band on a Management Endpoint." 
+"","","11b","Reserved" +"11:08","Scope (SCP): This field in combination with the Contents field specifies the contents of the Command and Feature Identifier List field in the log page.","","" +"","","Value","Command and Feature Identifier List Contents" +"","","0h","List of Admin Command Set opcodes" +"","","1h","Reserved" +"","","","List of Feature Identifiers" +"","","3h","List of a Management Interface Command Set opcodes (refer to the NVM Express Management Interface Specification)" +"","","4h","List of a PCIe Command Set opcodes (refer to the NVM Express Management Interface Specification)" +"","","5h to Fh","Reserved" diff --git a/tests/data/misaligned_1.csv b/tests/data/misaligned_1.csv new file mode 100644 index 0000000..b9f4715 --- /dev/null +++ b/tests/data/misaligned_1.csv @@ -0,0 +1,17 @@ +0,1,2,3,4,5,6,7 +"Bits","Description","","","","","","" +"14","Reserved","","","","","","" +"13:12","Contents (CNTTS): This field in combination with the Scope field specifies the contents of the Command and Feature Identifier List field in the log page.","","","","","","" +"","","","Value","","Command and Feature Identifier List Definition","","" +"","","","00b","","List of command opcodes or Feature Identifiers based on the Scope field that are supported to be prohibited.","","" +"","","","01b","","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received on an NVM Express controller Admin submission queue.","","" +"","","","10b","","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received out-of-band on a Management Endpoint.","","" +"","","","11b","","Reserved","","" +"11:08","Scope (SCP): This field in combination with the Contents field specifies the contents of the Command and Feature Identifier List field in the log page.","","","","","","" +"","","Value","","Command and Feature Identifier List Contents","","","" +"","","0h","","List of Admin 
Command Set opcodes","","","" +"","","1h","","Reserved","","","" +"","","2h","","List of Feature Identifiers","","","" +"","","3h","","List of a Management Interface Command Set opcodes (refer to the NVM Express Management Interface Specification)","","","" +"","","4h","","List of a PCIe Command Set opcodes (refer to the NVM Express Management Interface Specification)","","","" +"","","5h to Fh","","Reserved","","","" diff --git a/tests/data/misaligned_2.csv b/tests/data/misaligned_2.csv new file mode 100644 index 0000000..3b635d1 --- /dev/null +++ b/tests/data/misaligned_2.csv @@ -0,0 +1,17 @@ +0,1,2,3,4,5,6,7 +"Bits","Description","","","","","","" +"14","Reserved","","","","","","" +"13:12","Contents (CNTTS): This field in combination with the Scope field specifies the contents of the Command and Feature Identifier List field in the log page.","","","","","","" +"","","Value","","Command and Feature Identifier List Definition","","","" +"","","00b","","List of command opcodes or Feature Identifiers based on the Scope field that are supported to be prohibited.","","","" +"","","01b","","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received on an NVM Express controller Admin submission queue.","","","" +"","","10b","","List of command opcodes or Feature Identifiers based on the Scope field that are currently prohibited if received out-of-band on a Management Endpoint.","","","" +"","","11b","","Reserved","","","" +"11:08","Scope (SCP): This field in combination with the Contents field specifies the contents of the Command and Feature Identifier List field in the log page.","","","","","","" +"","","","Value","","Command and Feature Identifier List Contents","","" +"","","","0h","","List of Admin Command Set opcodes","","" +"","","","1h","","Reserved","","" +"","","","2h","","List of Feature Identifiers","","" +"","","","3h","","List of a Management Interface Command Set opcodes (refer to the NVM Express 
Management Interface Specification)","","" +"","","","4h","","List of a PCIe Command Set opcodes (refer to the NVM Express Management Interface Specification)","","" +"","","","5h to Fh","","Reserved","","" diff --git a/tests/data/misaligned_3_col.csv b/tests/data/misaligned_3_col.csv new file mode 100644 index 0000000..254a121 --- /dev/null +++ b/tests/data/misaligned_3_col.csv @@ -0,0 +1,10 @@ +0,1,2,3,4,5,6 +"A","1","","","","" +"B","2","","","","" +"","","C","","3","D" +"","","E","","4","F" +"","","G","","5","H" +"I","6","","","","" +"","","J","7","","K" +"","","L","8","","M" +"","","N","9","","O" \ No newline at end of file diff --git a/tests/data/misaligned_3_col_with_missing_columns.csv b/tests/data/misaligned_3_col_with_missing_columns.csv new file mode 100644 index 0000000..7304203 --- /dev/null +++ b/tests/data/misaligned_3_col_with_missing_columns.csv @@ -0,0 +1,10 @@ +0,1,2,3,4,5,6 +"A","1","","","","" +"B","2","","","","" +"","","C","","3","D" +"","","E","","","F" +"","","G","","","H" +"I","6","","","","" +"","","J","7","","K" +"","","L","8","","" +"","","M","9","","" \ No newline at end of file diff --git a/tests/data/misaligned_3_col_with_missing_values.csv b/tests/data/misaligned_3_col_with_missing_values.csv new file mode 100644 index 0000000..5a1237f --- /dev/null +++ b/tests/data/misaligned_3_col_with_missing_values.csv @@ -0,0 +1,10 @@ +0,1,2,3,4,5,6 +"A","1","","","","" +"B","2","","","","" +"","","C","","3","D" +"","","E","","","F" +"","","G","","5","H" +"I","6","","","","" +"","","J","7","","K" +"","","L","8","","" +"","","M","9","","N" \ No newline at end of file diff --git a/tests/data/misaligned_double_nested.csv b/tests/data/misaligned_double_nested.csv new file mode 100644 index 0000000..78d2cf2 --- /dev/null +++ b/tests/data/misaligned_double_nested.csv @@ -0,0 +1,24 @@ + +"0","1","2","3","4","5","6","7","8","9" +"Bytes","Description","","","","","","","","" +"0","","","","","","","","","" +"","","Bits","Description","","","","","","" 
+"","","7:6","Reserved","","","","","","" +"","","5:4","Contents Selected (CS): This field in combination with the Scope Selected field indicates the contents of the Command and Feature Identifier List field in the log page. The Content Selected field is specified by the contents of the Contents field in the Log Specific Field field of the Get Log Page command.","","","","","","" +"","","","","","Value","Description","","","" +"","","","","","00b","List contains command opcodes or Set Features Feature Identifiers based on the Scope Selected field that are supported to be prohibited","","","" +"","","","","","01b","List contains command opcodes or Set Features Feature Identifiers based on the Scope Selected field that are currently prohibited if received on an NVM Express controller submission queue","","","" +"","","","","","10b","List contains command opcodes or Set Features Feature Identifiers based on the Scope field that are currently prohibited if received out-of-band on a Management Endpoint","","","" +"","","","","","11b","Reserved","","","" +"","","3:0","Scope Selected (SS): This field in combination with the Contents Selected field indicates what the Command and Feature Identifier List field contains in the log page. 
The Scope Selected field is specified by the contents of the Scope field in the Log Specific field of the Get Log Page command.","","","","","","" +"","","","","Value","","Description","","","" +"","","","","0h","","List contains Admin Command Set opcodes","","","" +"","","","","1h","","Reserved","","","" +"","","","","2h","","List contains Set Features Feature Identifiers","","","" +"","","","","3h","","List contains Management Interface Command Set opcodes","","","" +"","","","","4h","","List contains PCIe Command Set opcodes","","","" +"","","","","5h to Fh","","Reserved","","","" +"2:1","Reserved","","","","","","","","" +"3","Length (LNGTH): This field indicates the length in bytes (n) of the Command and Feature Identifier List field that follow in the log page. If the Command and Feature Identifier List field contains no coded values, then this field shall be cleared to 0h.","","","","","","","","" +"n+3:4","Command and Feature Identifier List (CFIL): The contents of this field are dependent on the setting of the Contents Selected field and Scope Selected field. This field contains a list of coded values identified by the Scope Selected field and the Content Selected field. 
The list shall be in order from lowest numerical value to highest numerical value.","","","","","","","","" +"511:n+4","Reserved","","","","","","","","" diff --git a/tests/data/misaligned_mixed_col_1.csv b/tests/data/misaligned_mixed_col_1.csv new file mode 100644 index 0000000..38f1b6b --- /dev/null +++ b/tests/data/misaligned_mixed_col_1.csv @@ -0,0 +1,10 @@ +0,1,2,3,4,5 +"A","1","","","","" +"B","2","","","","" +"","","C","","","D" +"","","E","","","F" +"","","G","","","H" +"I","6","","","","" +"","","J","7","","K" +"","","L","8","","M" +"","","N","9","","O" diff --git a/tests/data/misaligned_mixed_col_2.csv b/tests/data/misaligned_mixed_col_2.csv new file mode 100644 index 0000000..bbafffe --- /dev/null +++ b/tests/data/misaligned_mixed_col_2.csv @@ -0,0 +1,10 @@ +0,1,2,3,4,5 +"A","1","","","","" +"B","2","","","","" +"","","C","","D","" +"","","E","","F","" +"","","G","","H","" +"I","6","","","","" +"","","J","7","","K" +"","","L","8","","M" +"","","N","9","","O" \ No newline at end of file diff --git a/tests/data/misaligned_mixed_col_3.csv b/tests/data/misaligned_mixed_col_3.csv new file mode 100644 index 0000000..b83d10a --- /dev/null +++ b/tests/data/misaligned_mixed_col_3.csv @@ -0,0 +1,10 @@ +0,1,2,3,4,5 +"A","1","","","","" +"B","2","","","","" +"","","C","D","","" +"","","E","F","","" +"","","G","H","","" +"I","6","","","","" +"","","J","7","","K" +"","","L","8","","M" +"","","N","9","","O" \ No newline at end of file diff --git a/tests/data/misaligned_with_missing_values.csv b/tests/data/misaligned_with_missing_values.csv new file mode 100644 index 0000000..b42333d --- /dev/null +++ b/tests/data/misaligned_with_missing_values.csv @@ -0,0 +1,17 @@ +0,1,2,3,4,5,6,7 +"Bits","Description","","","","","","" +"14","Reserved","","","","","","" +"13:12","Contents (CNTTS): This field in combination with the Scope field specifies the contents of the Command and Feature Identifier List field in the log page.","","","","","","" +"","","","Value","","Command and 
@pytest.mark.parametrize("before, after", [("misaligned_1", "aligned"),
                                           ("misaligned_2", "aligned"),
                                           ("aligned", "aligned"),
                                           ("misaligned_with_missing_values", "aligned_with_missing_values"),
                                           ("aligned_with_missing_values", "aligned_with_missing_values"),
                                           ("misaligned_3_col", "aligned_3_col"),
                                           ("aligned_3_col", "aligned_3_col"),
                                           ("aligned_mixed_col", "aligned_mixed_col"),
                                           ("misaligned_mixed_col_1", "aligned_mixed_col"),
                                           ("misaligned_mixed_col_2", "aligned_mixed_col"),
                                           ("misaligned_mixed_col_3", "aligned_mixed_col"),
                                           ("misaligned_3_col_with_missing_values", "aligned_3_col_with_missing_values"),
                                           ("aligned_3_col_with_missing_values", "aligned_3_col_with_missing_values"),
                                           ("misaligned_3_col_with_missing_columns", "aligned_3_col_with_missing_columns"),
                                           ("aligned_3_col_with_missing_columns", "aligned_3_col_with_missing_columns"),
                                           ("misaligned_double_nested", "aligned_double_nested"),
                                           ("aligned_double_nested", "aligned_double_nested"),
                                           ("aligned_no_nesting", "aligned_no_nesting")])
def test_clean_table(before, after):
    """Check that cleaning the 'before' fixture yields the 'after' fixture.

    Both names refer to CSV files in 'tests/data/' (without the extension).
    """
    prefix = "tests/data/"
    postfix = ".csv"
    before = utils.expand_path(prefix+before+postfix)
    after = utils.expand_path(prefix+after+postfix)
    try:
        before_df = pd.read_csv(before)
        after_df = pd.read_csv(after)

    except FileNotFoundError:
        # The fixture paths are relative, so the working directory matters
        pytest.fail("tests must be run from 'nvme-lint/'")

    # Replace NaN with the empty string to mimic the actual tables
    before_df.replace(np.nan, "", inplace=True)
    after_df.replace(np.nan, "", inplace=True)

    clean = extractor.clean_table(before_df)

    assert compare_dataframes(clean, after_df)


def compare_dataframes(left, right):
    """Return True if both dataframes have the same shape and equal cells.

    This function is a replacement of '.equals()',
    this is necessary because '.equals()' is too strict with the typing for these tests.
    This function attempts a comparison of the values as floats,
    if the conversion fails then the comparison is made as strings.
    Note that NaN cells never compare equal as floats, so two NaN cells in the
    same position make the dataframes differ.
    """
    if left.shape != right.shape:
        return False

    rows, cols = left.shape
    for i in range(rows):
        for j in range(cols):
            # Index the cells directly instead of relying on negative indices wrapping around
            left_value = left.iloc[i, j]
            right_value = right.iloc[i, j]
            try:
                if float(left_value) != float(right_value):
                    return False
            except (TypeError, ValueError):
                # Not numeric (e.g. text or None): fall back to the string representation
                if str(left_value) != str(right_value):
                    return False
    return True