From e3ea0e222030980c315e3390ce1ec926445c39ed Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 25 Jul 2023 13:42:40 -0400 Subject: [PATCH 1/2] Enable returning all NER results --- gilda/ner.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/gilda/ner.py b/gilda/ner.py index c2c83c6..d542e46 100644 --- a/gilda/ner.py +++ b/gilda/ner.py @@ -69,6 +69,7 @@ def annotate( sent_split_fun=None, organisms=None, namespaces=None, + return_first: bool = True ) -> List[Annotation]: """Annotate a given text with Gilda. @@ -89,6 +90,8 @@ def annotate( namespaces : list[str], optional A list of namespaces to pass to the grounder to restrict the matches to. By default, no restriction is applied. + return_first : bool, optional + If True, only the best match per span is returned; otherwise all are. Returns ------- @@ -139,11 +142,12 @@ def annotate( len(raw_words[idx+span-1]) raw_span = ' '.join(raw_words[idx:idx+span]) - # Append raw_span, (best) match, start, end - match = matches[0] - entities.append( - (raw_span, match, start_coord, end_coord) - ) + if return_first: + matches = [matches[0]] + for match in matches: + entities.append( + (raw_span, match, start_coord, end_coord) + ) skip_until = idx + span break From 40e1936b5a51f31351caf8f276cbac222bc478d3 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 25 Jul 2023 13:53:18 -0400 Subject: [PATCH 2/2] Add test for return_first --- gilda/api.py | 6 +++++- gilda/ner.py | 2 +- gilda/tests/test_ner.py | 15 ++++++++++++++- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/gilda/api.py b/gilda/api.py index 5906d02..28d1bf2 100644 --- a/gilda/api.py +++ b/gilda/api.py @@ -112,6 +112,7 @@ def annotate( sent_split_fun=None, organisms=None, namespaces=None, + return_first: bool = True, ): """Annotate a given text with Gilda (i.e., do named entity recognition). 
@@ -130,6 +131,8 @@ def annotate( namespaces : list[str], optional A list of namespaces to pass to the grounder to restrict the matches to. By default, no restriction is applied. + return_first : bool, optional + If True, only the best match per span is returned; otherwise all are. Returns ------- @@ -145,7 +148,8 @@ def annotate( grounder=grounder, sent_split_fun=sent_split_fun, organisms=organisms, - namespaces=namespaces + namespaces=namespaces, + return_first=return_first, ) diff --git a/gilda/ner.py b/gilda/ner.py index d542e46..d2a2711 100644 --- a/gilda/ner.py +++ b/gilda/ner.py @@ -69,7 +69,7 @@ def annotate( sent_split_fun=None, organisms=None, namespaces=None, - return_first: bool = True + return_first: bool = True, ) -> List[Annotation]: """Annotate a given text with Gilda. diff --git a/gilda/tests/test_ner.py b/gilda/tests/test_ner.py index 5d6b508..e3b115c 100644 --- a/gilda/tests/test_ner.py +++ b/gilda/tests/test_ner.py @@ -1,7 +1,7 @@ from textwrap import dedent import gilda -from gilda.ner import annotate, get_brat +from gilda.ner import get_brat def test_annotate(): @@ -62,3 +62,16 @@ def test_get_brat(): #7\tAnnotatorNotes T7\tCHEBI:36080 """).lstrip() assert brat_str == match_str + + +def test_get_all(): + full_text = "This is about ER." + results = gilda.annotate(full_text, return_first=False) + assert len(results) > 1 + curies = { + scored_match.term.get_curie() + for _, scored_match, _, _ in results + } + assert "hgnc:3467" in curies # ESR1 + assert "fplx:ESR" in curies + assert "GO:0005783" in curies # endoplasmic reticulum