fix: less transformations on comment lines and others (#523)
Various fixes:
- avoid changing blank and comment lines (illustrated in the sketch below)
- fix space formatting of parent lines in unparse
- fix property ordering in unparse
alexgarel authored Jul 24, 2024
1 parent 0863d52 commit d366218
Showing 8 changed files with 158 additions and 108 deletions.
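For context, the sanitization that used to happen inside `_file_iter` now lives in a dedicated `prepare_line` step applied only to content lines, so blank and comment lines pass through untouched. A minimal self-contained sketch of its comma handling, reconstructed from the taxonomy_parser.py diff below (the lower comma is U+201A, used so real commas inside values are not mistaken for tag separators):

import re

def prepare_line(line: str) -> str:
    """Sketch of the new per-line sanitization (see taxonomy_parser.py below)."""
    line = line.strip()
    line = re.sub(r"[\s,]+$", "", line)          # drop trailing spaces and commas
    line = re.sub(r"(\d),(\d)", r"\1‚\2", line)  # 0,5 -> 0‚5 (decimal comma)
    line = re.sub(r"\\,", "\\‚", line)           # \, -> \‚ (escaped comma)
    return line

# tag-separating commas survive, the decimal comma is protected
assert prepare_line("en: vinegar, acidity 0,5%, ") == "en: vinegar, acidity 0‚5%"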
2 changes: 1 addition & 1 deletion parser/openfoodfacts_taxonomy_parser/parser/parser.py
@@ -143,9 +143,9 @@ def _create_child_links(self, child_links: list[ChildLink], project_label: str):
MATCH (p:{project_label}) USING INDEX p:{project_label}(id)
WHERE p.id = child_link.parent_id
MATCH (c:{project_label}) USING INDEX c:{project_label}(id)
- WHERE c.id = child_link.id
"""
+ """
+ WHERE c.id = child_link.id
CREATE (c)-[relations:is_child_of {position: child_link.position}]->(p)
WITH relations
UNWIND relations AS relation
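Why this query is assembled from two strings (a sketch, not stated in the commit): the first part is an f-string so {project_label} interpolates, while the CREATE clause lives in a plain string so the braces of its Cypher map literal need no doubling. The fix moves the second WHERE into the plain-string half:

project_label = "p_test_branch"  # hypothetical project label
query = (
    f"""
    MATCH (p:{project_label}) USING INDEX p:{project_label}(id)
    WHERE p.id = child_link.parent_id
    MATCH (c:{project_label}) USING INDEX c:{project_label}(id)
    """
    + """
    WHERE c.id = child_link.id
    CREATE (c)-[relations:is_child_of {position: child_link.position}]->(p)
    """  # in an f-string, {position: ...} would need escaped braces
)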
154 changes: 67 additions & 87 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -1,5 +1,4 @@
import collections
- import copy
import logging
import re
import sys
@@ -31,11 +30,6 @@ class NodeData:
properties: dict[str, str] = field(default_factory=dict)
tags: dict[str, list[str]] = field(default_factory=dict)
comments: dict[str, list[str]] = field(default_factory=dict)
- # comments_stack is a list of tuples (line_number, comment),
- # to keep track of comments just above the current line
- # during parsing of an entry, to be able to add them
- # to the right property or tag when possible
- comments_stack: list[tuple[int, str]] = field(default_factory=list)
is_external: bool = False # True if the node comes from another taxonomy
original_taxonomy: str | None = None # the name of the taxonomy the node comes from

@@ -62,6 +56,10 @@ def get_node_type(self):
else:
return NodeType.ENTRY

+ @property
+ def is_started(self):
+     return self.id or self.parent_tags or self.tags or self.properties


class PreviousLink(TypedDict):
before_id: str
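A hypothetical illustration of the new is_started property: a node now counts as "started" as soon as any identifying piece has been parsed, even before an id is assigned, so comments seen at that point attach to this node rather than the next one.

data = NodeData(is_before="en:milk")
data.parent_tags.append(("en:yogurts", 12))  # a parent line was read, no id yet
assert data.is_started  # a following comment line is buffered for this node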
@@ -96,16 +94,7 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]:
for line_number, line in enumerate(file):
if line_number < start:
continue
- # sanitizing
- # remove any space characters at end of line
- line = line.rstrip()
- # replace commas between digits
- # and that have no space around by a lower comma character
- # and do the same for escaped comma (preceded by a \)
- # (to distinguish them from commas acting as tags separators)
- line = re.sub(r"(\d),(\d)", r"\1‚\2", line)
- line = re.sub(r"\\,", "\\‚", line)
- yield line_number, line
+ yield line_number, line.rstrip("\n")
line_count += 1
yield line_count, "" # to end the last entry if not ended

@@ -114,13 +103,30 @@ def _normalize_entry_id(self, raw_id: str) -> str:
Get a normalized string but keeping the language code "lc:",
used for id and parent tag
"""
+ raw_id = raw_id.strip()
lc, main_tag = raw_id.split(":", 1)
normalized_main_tag = normalize_text(main_tag, lc, stopwords=self.stopwords)
normalized_id = f"{lc}:{normalized_main_tag}"
return normalized_id

+ def prepare_line(self, line: str) -> str:
+     """prepare line for parsing
+     This is different from `normalize_text` which is to compute ids
+     """
+     line = line.strip()
+     # sanitizing
+     # remove any space or commas characters at end of line
+     line = re.sub(r"[\s,]+$", "", line)
+     # replace commas between digits and that have no space around by a lower comma character
+     # and do the same for escaped comma (preceded by a \)
+     # (to distinguish them from commas acting as tags separators)
+     line = re.sub(r"(\d),(\d)", r"\1‚\2", line)
+     line = re.sub(r"\\,", "\\‚", line)
+     return line

def undo_normalize_text(self, text: str) -> str:
"""Undo some normalizations made in `_file_iter`"""
"""Undo some normalizations made in `prepare_line`"""
# restore commas from lower comma characters
text = re.sub(r"(\d)‚(\d)", r"\1,\2", text)
text = re.sub(r"\\‚", "\\,", text)
@@ -149,7 +155,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]:
h = 0
header: list[str] = []
for _, line in self._file_iter(filename):
- if not (line) or line[0] == "#":
+ if not (line.strip()) or line[0] == "#":
header.append(line)
else:
break
@@ -158,7 +164,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]:
# we don't want to eat the comments of the next block
# and it removes the last separating line
for i in range(len(header)):
- if header.pop():
+ if header.pop().strip():
h -= 1
else:
break
@@ -170,7 +176,7 @@ def _entry_end(self, line: str, data: NodeData) -> bool:
if data.id.startswith("stopwords") or data.id.startswith("synonyms"):
# stopwords and synonyms are one-liners; if the id is set, it's the end
return True
- if not line and data.id:
+ if not line.strip() and data.id:
# entries are separated by a blank line
return True
return False
@@ -182,7 +188,7 @@ def _remove_separating_line(self, data: NodeData) -> NodeData:
"""
is_before = data.is_before
# first, check if there is at least one preceding line
- if data.preceding_lines and not data.preceding_lines[0]:
+ if data.preceding_lines and not data.preceding_lines[0].strip():
if data.id.startswith("synonyms"):
# it's a synonyms block,
# if the previous block is a stopwords block,
@@ -202,42 +208,13 @@ def _remove_separating_line(self, data: NodeData) -> NodeData:
data.preceding_lines.pop(0)
return data

- def _get_node_data_with_comments_above_key(
-     self, data: NodeData, line_number: int, key: str
- ) -> NodeData:
+ def _add_comments(self, data: NodeData, comments: list[str], key: str) -> NodeData:
"""Returns the updated node data with comments above the given
key stored in the {key}_comments property."""
- new_data = copy.deepcopy(data)
-
- # Get comments just above the given line
- comments_above = []
- current_line = line_number - 1
- while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line:
-     comments_above.append(new_data.comments_stack.pop()[1])
-     current_line -= 1
- if comments_above:
-     new_data.comments[key + "_comments"] = comments_above[::-1]
-
- return new_data
-
- def _get_node_data_with_parent_and_end_comments(self, data: NodeData) -> NodeData:
-     """Returns the updated node data with parent and end comments"""
-     new_data = copy.deepcopy(data)
-
-     # Get parent comments (part of an entry block and just above/between the parents lines)
-     parent_comments = []
-     while new_data.preceding_lines and new_data.preceding_lines[-1] != "":
-         parent_comments.append(new_data.preceding_lines.pop())
-     if parent_comments:
-         new_data.comments["parent_comments"] = parent_comments[::-1]
-
-     # Get end comments (part of an entry block after the last tag/prop
-     # and before the separating blank line)
-     end_comments = [comment[1] for comment in new_data.comments_stack]
-     if end_comments:
-         new_data.comments["end_comments"] = end_comments
-
-     return new_data
+ if comments:
+     data.comments.setdefault(f"{key}_comments", []).extend(comments)
+     # reset the comments list
+     comments[:] = []

_language_code_prefix = re.compile(
    r"[a-zA-Z][a-zA-Z][a-zA-Z]?([-_][a-zA-Z][a-zA-Z][a-zA-Z]?)?:"
)
@@ -247,17 +224,35 @@ def is_entry_synonyms_line(self, line):
matching_prefix = self._language_code_prefix.match(line)
if matching_prefix:
# verify it's not a property, that is a name followed by a colon and a language
# we need no-qa this because of black vs flake8 opinion
return not (
self._language_code_prefix.match(line[matching_prefix.end() :]) # noqa: E203
)
return False
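Hypothetical examples of the classification this implements (derived from the regex above):

parser.is_entry_synonyms_line("fr: yaourts, yoghourts")  # True: starts with a language code
parser.is_entry_synonyms_line("pt_BR: iogurtes")         # True: regional variant
parser.is_entry_synonyms_line("vegan:en: yes")           # False: "vegan" is not a 2-3 letter code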

+ def finalize_data(self, data, comments, saved_nodes):
+     data = self._remove_separating_line(data)
+     if data.get_node_type() == NodeType.ENTRY:
+         self._add_comments(data, comments, "end")
+         if data.id in saved_nodes:
+             # this duplicate node will be merged with the first one
+             data.is_before = None
+             msg = (
+                 f"WARNING: Entry with same id {data.id} already exists, "
+                 f"duplicate id in file at line {data.src_position}. "
+                 "The two nodes will be merged, keeping the last "
+                 "values in case of conflicts."
+             )
+             self.parser_logger.error(msg)
+         else:
+             saved_nodes.append(data.id)
+     return data

def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
"""Transform data from file to dictionary"""
saved_nodes = []
index_stopwords = 0
index_synonyms = 0
+ comments = []

# Check if it is correctly written
correctly_written = re.compile(r"\w+\Z")
@@ -268,39 +263,27 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
line_number = (
entries_start_line # if the iterator is empty, line_number will not be unbound
)
- for line_number, line in self._file_iter(filename, entries_start_line):
+ for line_number, raw_line in self._file_iter(filename, entries_start_line):
# yield data if block ended
- if self._entry_end(line, data):
-     data = self._remove_separating_line(data)
-     if data.get_node_type() == NodeType.ENTRY:
-         data = self._get_node_data_with_parent_and_end_comments(data)
-         if data.id in saved_nodes:
-             # this duplicate node will be merged with the first one
-             data.is_before = None
-             msg = (
-                 f"WARNING: Entry with same id {data.id} already exists, "
-                 f"duplicate id in file at line {data.src_position}. "
-                 "The two nodes will be merged, keeping the last "
-                 "values in case of conflicts."
-             )
-             self.parser_logger.error(msg)
-         else:
-             saved_nodes.append(data.id)
-     is_before = data.id
-     yield data  # another function will use this dictionary to create a node
+ if self._entry_end(raw_line, data):
+     is_before = data.is_before
+     # another function will use data to create a node
+     yield self.finalize_data(data, comments, saved_nodes)
+     # if data was a duplicate (is_before is None) reuse same is_before
+     is_before = data.id if data.is_before else is_before
data = NodeData(is_before=is_before)

# harvest the line
- if not (line) or line[0] == "#":
+ if not (raw_line.strip()) or raw_line[0] == "#":
# comment or blank line
- if data.id:
+ if data.is_started:
# we are within the node definition
- data.comments_stack.append((line_number, line))
+ comments.append(raw_line)
else:
# we are before the actual node
- data.preceding_lines.append(line)
+ data.preceding_lines.append(raw_line)
else:
line = line.rstrip(",")
line = self.prepare_line(raw_line)
if not data.src_position:
data.src_position = line_number + 1
if line.startswith("stopwords"):
@@ -342,6 +325,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
elif line[0] == "<":
# parent definition
data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1))
self._add_comments(data, comments, "parent")
elif self.is_entry_synonyms_line(line):
# synonyms definition
if not data.id:
@@ -363,9 +347,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
tagsids_list.append(word_normalized)
data.tags["tags_" + lang] = tags_list
data.tags["tags_ids_" + lang] = tagsids_list
- data = self._get_node_data_with_comments_above_key(
-     data, line_number, "tags_" + lang
- )
+ self._add_comments(data, comments, "tags_" + lang)
else:
# property definition
property_name = None
@@ -384,17 +366,15 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
correctly_written.match(property_name) and correctly_written.match(lc)
):
self.parser_logger.error(
f"Reading error at line {line_number + 1},"
f"Reading error at line {line_number + 1}, "
f"unexpected format: '{self.parser_logger.ellipsis(line)}'"
)
if property_name:
prop_key = "prop_" + property_name + "_" + lc
data.properties[prop_key] = self.undo_normalize_text(
property_value.strip()
)
- data = self._get_node_data_with_comments_above_key(
-     data, line_number, prop_key
- )
+ self._add_comments(data, comments, prop_key)

data.id = "__footer__"
data.preceding_lines.pop(0)
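A hedged sketch of the duplicate handling that finalize_data centralizes (the setup is hypothetical; NodeData fields as in the diff above):

data = NodeData(is_before="en:yogurts")
data.id = "en:milk"
data = parser.finalize_data(data, comments=[], saved_nodes=["en:milk"])
# "en:milk" was already seen: the node is flagged for merging, the duplicate
# is logged, and is_before is cleared so the harvest loop keeps chaining
# from the previous anchor instead.
assert data.is_before is None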
4 changes: 2 additions & 2 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
@@ -59,7 +59,7 @@ def get_tags_line(self, node, lc):

@staticmethod
def property_sort_key(property):
- name, lang_code, *_ = property.split("_", 2)
+ name, lang_code = property.rsplit("_", 1)
# give priority to xx and en language codes
priority = {"en": 1, "xx": 0}
return (name, priority.get(lang_code, 100), lang_code)
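Why the switch to rsplit: with split("_", 2), a multi-word property key such as carbon_footprint_fr_foodges_value_fr yielded name "carbon" and "language" footprint, scrambling the sort; rsplit("_", 1) always takes the real trailing language code. A minimal sketch (key format assumed from the test data):

def property_sort_key(property):
    name, lang_code = property.rsplit("_", 1)
    priority = {"en": 1, "xx": 0}  # xx first, then en, then other languages
    return (name, priority.get(lang_code, 100), lang_code)

keys = ["vegan_fr", "vegan_en", "vegan_xx", "carbon_footprint_fr_foodges_value_fr"]
print(sorted(keys, key=property_sort_key))
# ['carbon_footprint_fr_foodges_value_fr', 'vegan_xx', 'vegan_en', 'vegan_fr']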
@@ -88,7 +88,7 @@ def get_parents_lines(self, parents):
parent = dict(parent)
lc = parent["main_language"]
parent_id = parent["tags_" + lc][0]
yield "<" + lc + ": " + parent_id
yield "< " + lc + ":" + parent_id

def iter_lines(self, project_label):
previous_block_id = ""
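Parent lines are now unparsed as "< en:milk" (space after "<", none around the colon), matching the lc:value syntax of ids; the test fixtures below are updated accordingly.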
16 changes: 8 additions & 8 deletions parser/tests/data/test.txt
@@ -6,33 +6,33 @@ synonyms:en: passion fruit, passionfruit

synonyms:fr: fruit de la passion, maracuja, passion

<en: milk
< en:milk
en: yogurts, yoghurts
fr: yaourts, yoghourts, yogourts

<en: yogurts
< en:yogurts
en: banana yogurts
fr: yaourts à la banane

<en: yogurts
< en:yogurts
en: Passion fruit yogurts
fr: yaourts au fruit de la passion

<fr: yaourts fruit de la passion
< fr:yaourts fruit de la passion
fr: yaourts au fruit de la passion allégés

- # meat
+ # meat

en: meat
carbon_footprint_fr_foodges_value:fr: 10
vegan:en: no

<en: meat
< en:meat
en: fake-meat
vegan:en: yes

en: fake-stuff

<en: fake-stuff
<en: fake-meat
< en:fake-stuff
< en:fake-meat
en: fake-duck-meat
9 changes: 9 additions & 0 deletions parser/tests/data/test_comment_below_parent.txt
@@ -0,0 +1,9 @@
+ # test a bug found with ingr taxonomy
+
+ en:milk
+
+ # a comment above the parent
+ <en: milk
+ # a comment below the parent
+ en: Cow milk
+ fr: lait de vache
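With the _add_comments flow above, the comment above the parent is flushed into parent_comments when the parent line is read, while the comment below it stays buffered and attaches to the following tags line, so a round trip no longer relocates it.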