Merge pull request #120 from cthoyt/return-first

Enable returning all NER results
gyorilab · Jul 26, 2023 · ddea0cc
2 parents 125ab65 + 40e1936
commit ddea0cc
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 7 deletions.
diff --git a/gilda/api.py b/gilda/api.py
@@ -112,6 +112,7 @@ def annotate(
     sent_split_fun=None,
     organisms=None,
     namespaces=None,
+    return_first: bool = True,
 ):
     """Annotate a given text with Gilda (i.e., do named entity recognition).
 
@@ -130,6 +131,8 @@ def annotate(
     namespaces : list[str], optional
         A list of namespaces to pass to the grounder to restrict the matches
         to. By default, no restriction is applied.
+    return_first : bool, optional
+        If True, return only the first (best) match per span, not all matches.
 
     Returns
     -------
@@ -145,7 +148,8 @@ def annotate(
         grounder=grounder,
         sent_split_fun=sent_split_fun,
         organisms=organisms,
-        namespaces=namespaces
+        namespaces=namespaces,
+        return_first=return_first,
     )
 
 

diff --git a/gilda/ner.py b/gilda/ner.py
@@ -69,6 +69,7 @@ def annotate(
     sent_split_fun=None,
     organisms=None,
     namespaces=None,
+    return_first: bool = True,
 ) -> List[Annotation]:
     """Annotate a given text with Gilda.
 
@@ -89,6 +90,8 @@ def annotate(
     namespaces : list[str], optional
         A list of namespaces to pass to the grounder to restrict the matches
         to. By default, no restriction is applied.
+    return_first : bool, optional
+        If True, return only the first (best) match per span, not all matches.
 
     Returns
     -------
@@ -139,11 +142,12 @@ def annotate(
                         len(raw_words[idx+span-1])
                     raw_span = ' '.join(raw_words[idx:idx+span])
 
-                    # Append raw_span, (best) match, start, end
-                    match = matches[0]
-                    entities.append(
-                        (raw_span, match, start_coord, end_coord)
-                    )
+                    if return_first:
+                        matches = [matches[0]]
+                    for match in matches:
+                        entities.append(
+                            (raw_span, match, start_coord, end_coord)
+                        )
 
                     skip_until = idx + span
                     break

diff --git a/gilda/tests/test_ner.py b/gilda/tests/test_ner.py
@@ -1,7 +1,7 @@
 from textwrap import dedent
 
 import gilda
-from gilda.ner import annotate, get_brat
+from gilda.ner import get_brat
 
 
 def test_annotate():
@@ -62,3 +62,16 @@ def test_get_brat():
         #7\tAnnotatorNotes T7\tCHEBI:36080
         """).lstrip()
     assert brat_str == match_str
+
+
+def test_get_all():
+    full_text = "This is about ER."
+    results = gilda.annotate(full_text, return_first=False)
+    assert len(results) > 1
+    curies = {
+        scored_match.term.get_curie()
+        for _, scored_match, _, _ in results
+    }
+    assert "hgnc:3467" in curies  # ESR1
+    assert "fplx:ESR" in curies
+    assert "GO:0005783" in curies  # endoplasmic reticulum