diff --git a/src/bioconverters/utils.py b/src/bioconverters/utils.py
index 18e8d4c..c1ae173 100644
--- a/src/bioconverters/utils.py
+++ b/src/bioconverters/utils.py
@@ -210,6 +210,94 @@ def get_tag_path(mapping: Dict[etree.Element, etree.Element], node: etree.Elemen
return '/'.join((path[::-1]))
+def first_empty_index(items) -> int:
+ """
+ Return the index of the first falsy item in an iterable. Defaults to 0 if no items are falsy
+ """
+ for i, item in enumerate(items):
+ if not item:
+ return i
+ return 0
+
+
+def get_unique_child_element_index(elem: etree.Element, child_elem_type: str) -> int:
+ """
+ Get a child element from an XML parent node and ensure that 1 and exactly 1 element is returned
+
+ Args:
+ elem: the element to search children of
+ child_elem_type: the tag type of the element in question
+ """
+ indices = []
+ for i, child in enumerate(elem):
+ if child.tag == child_elem_type:
+ indices.append(i)
+ if not indices:
+ raise KeyError(f'unable to find child element with tag type = {child_elem_type}')
+ if len(indices) > 1:
+ raise ValueError(f'found multiple child elements with tag type = {child_elem_type}')
+ return indices[0]
+
+
+def normalize_table(elem: etree.Element) -> etree.Element:
+ """
+ Replace any multi-row table header with a single-row header by repeating col-spanning labels as prefixes on their sub-columns
+ """
+ header_elem_index = get_unique_child_element_index(elem, 'thead')
+ header = elem[header_elem_index]
+
+ header_cols = 0
+ header_rows = len(header)
+ for row in header:
+ for header_cell in row:
+ header_cols += int(header_cell.attrib.get('colspan', 1))
+ break
+
+ header_matrix = []
+ filled_cells = []
+ for _ in range(header_rows):
+ row = []
+ for _ in range(header_cols):
+ row.append('')
+ header_matrix.append(row)
+ filled_cells.append([0 for _ in row])
+
+ for i_row, row in enumerate(header):
+ i_col = 0
+ for header_cell in row:
+ text = str(merge_text_chunks(chunk for chunk in tag_handler(header_cell)))
+ row_cells = [r + i_row for r in range(int(header_cell.attrib.get('rowspan', 1)))]
+ col_cells = [
+ r + first_empty_index(filled_cells[i_row])
+ for r in range(int(header_cell.attrib.get('colspan', 1)))
+ ]
+
+ for r in row_cells:
+ for c in col_cells:
+ header_matrix[r][c] = text
+ filled_cells[r][c] = 1
+
+ for col in range(header_cols):
+ for row in range(1, header_rows)[::-1]:
+ if header_matrix[row][col] == header_matrix[row - 1][col]:
+ header_matrix[row][col] = ''
+
+ # now flatten the header rows
+ for row in header_matrix[1:]:
+ for i_col, col in enumerate(row):
+ if col:
+ header_matrix[0][i_col] += ' ' + col
+
+ result = [re.sub(r'[\s\n]+', ' ', col.strip()) for col in header_matrix[0]]
+ new_xml = []
+ for col in result:
+ new_xml.append(f'
{col} | ')
+
+ new_header_elem = etree.fromstring(f'{"".join(new_xml)}
')
+ elem[header_elem_index] = new_header_elem
+ return elem
+
+
def tag_handler(
elem: etree.Element, custom_handlers: Dict[str, TagHandlerFunction] = {}
) -> List[TextChunk]:
@@ -226,6 +314,8 @@ def tag_handler(
return custom_handlers[elem.tag](elem, custom_handlers=custom_handlers)
except NotImplementedError:
pass
+ if elem.tag == 'table':
+ elem = normalize_table(elem)
# Extract any raw text directly in XML element or just after
head = elem.text or ""
tail = elem.tail or ""
diff --git a/tests/data/colspans_table.xml b/tests/data/colspans_table.xml
new file mode 100644
index 0000000..98252c6
--- /dev/null
+++ b/tests/data/colspans_table.xml
@@ -0,0 +1,42 @@
+
+
+
+
+
+ |
+ All patients in NTRK gene fusion-positive efficacy-evaluable population (n=54) |
+
+
+
+
+ Age, years |
+ 58 (48–67) |
+
+
+ Sex |
+
+
+ Female |
+ 32 (59%) |
+
+
+ Male |
+ 22 (41%) |
+
+
+ Race |
+
+
+ White |
+ 43 (80%) |
+
+
+ Asian |
+ 7 (13%) |
+
+
+ Other |
+ 4 (7%) |
+
+
+
diff --git a/tests/data/multi-level-table-header.xml b/tests/data/multi-level-table-header.xml
new file mode 100644
index 0000000..70c00c2
--- /dev/null
+++ b/tests/data/multi-level-table-header.xml
@@ -0,0 +1,150 @@
+
+
+
+
+
+
+ p53
+MUTATION |
+ FUNCTIONALa
+
+STATUS |
+ IARC DATABASEb
+ |
+ FEATURESc
+ |
+
+
+ SOMATIC |
+ GERMLINEFAMILIES |
+
+
+ TOTAL |
+ BREAST |
+
+
+
+
+ T125R |
+ ALTERED |
+ 2 |
+ 1 |
+ 0 |
+ |
+
+
+ L130V |
+ ALTERED |
+ 21 |
+ 3 |
+ 0 |
+ Neo. |
+
+
+ C135F |
+ LOSS |
+ 49 |
+ 3 |
+ 0 |
+ |
+
+
+ C135Y |
+ LOSS |
+ 70 |
+ 11 |
+ 0 |
+ Neo. |
+
+
+ A138V |
+ FUNCTIONAL |
+ 48 |
+ 7 |
+ 0 |
+ |
+
+
+ C176F |
+ LOSS |
+ 181 |
+ 7 |
+ 0 |
+ L2-Zn |
+
+
+ H179R |
+ LOSS |
+ 139 |
+ 16 |
+ 0 |
+ L2-Zn; Neo. |
+
+
+ R181P |
+ LOSS |
+ 22 |
+ 2 |
+ 1 |
+ L2; FH |
+
+
+ S183L |
+ LOSS |
+ 3 |
+ 1 |
+ 0 |
+ L2 |
+
+
+ P190L |
+ ALTERED |
+ 48 |
+ 4 |
+ 0 |
+ L2; BRCA1 |
+
+
+ L194P |
+ ALTERED |
+ 14 |
+ 1 |
+ 0 |
+ L2; BRCA |
+
+
+ L194R |
+ LOSS |
+ 55 |
+ 9 |
+ 0 |
+ L2 |
+
+
+ H214R |
+ ALTERED |
+ 72 |
+ 5 |
+ 0 |
+ BRCA2 |
+
+
+ Y220C |
+ ALTERED |
+ 315 |
+ 41 |
+ 4 |
+ LFS; Neo. |
+
+
+ G245S |
+ LOSS |
+ 396 |
+ 35 |
+ 18 |
+ L3; LFS, LFL, FH; Neo.;
+BRCA1 |
+
+
+
+
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 2c9eac0..e57dd1f 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -404,6 +404,17 @@ def test_floating_table():
assert len(table_body[0].split(TABLE_DELIMITER)) == expected_columns * expected_rows
+def test_multilevel_table_header():
+ xml_input = data_file_path('multi-level-table-header.xml')
+ with open(xml_input, 'r') as fh:
+ xml_data = fh.read()
+ chunks = extract_text_chunks([etree.fromstring(xml_data)])
+ table_header = [c.text for c in chunks if c.xml_path.endswith('thead')]
+ assert table_header == [
+ 'p53 MUTATION\tFUNCTIONAL a STATUS\tIARC DATABASE b SOMATIC TOTAL\tIARC DATABASE b SOMATIC BREAST\tIARC DATABASE b GERMLINE FAMILIES\tFEATURES c'
+ ]
+
+
@pytest.mark.parametrize(
'input,output',
[