Merge pull request #211 from Inist-CNRS/services/chem-ner/fix-bug

fix(chem-ner): fix how to get entities
Inist-CNRS · Nov 26, 2024 · 1179f77 · 1179f77
2 parents 91b36d3 + fb5eed9
commit 1179f77
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 23 deletions.
diff --git a/services/chem-ner/README.md b/services/chem-ner/README.md
@@ -1,4 +1,4 @@
-# ws-chem-ner@4.0.0
+# ws-chem-ner@4.0.1
 
 Trouve des entités nommées de Chimie dans un texte
 

diff --git a/services/chem-ner/package.json b/services/chem-ner/package.json
@@ -1,7 +1,7 @@
 {
     "private": true,
     "name": "ws-chem-ner",
-    "version": "4.0.0",
+    "version": "4.0.1",
     "description": "Trouve des entités nommées de Chimie dans un texte",
     "repository": {
         "type": "git",

diff --git a/services/chem-ner/swagger.json b/services/chem-ner/swagger.json
@@ -3,7 +3,7 @@
     "info": {
         "title": "chem-ner - Trouve des entités nommées de Chimie dans un texte",
         "summary": "Renvoie un Json composé comportant d'un champ \"chemical\" et d'un champ \"chemical_disambiguisate\"",
-        "version": "4.0.0",
+        "version": "4.0.1",
         "termsOfService": "https://services.istex.fr/",
         "contact": {
             "name": "Inist-CNRS",
@@ -15,7 +15,7 @@
             "x-comment": "Will be automatically completed by the ezs server."
         },
         {
-            "url": "http://vptdmservices.intra.inist.fr:49306/",
+            "url": "http://vptdmservices.intra.inist.fr:49323/",
             "description": "Latest version for production",
             "x-profil": "Standard"
         }

diff --git a/services/chem-ner/v1/chem/tagger.py b/services/chem-ner/v1/chem/tagger.py
@@ -43,25 +43,34 @@ def predict_formula_ml(input_text):
 
     predictions = torch.argmax(output.logits, dim=-1)
 
-    # Get token that contains "CHEMICAL"
-    tokens = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
-    chemical_tokens_list = []
-    i=0
-
-    while i < len(predictions[0]):
-        # prediction [0][i] depends of i : {0 : "B-CHEMICAL" , 1 : "I-CHEMICAL" , 2: "NOT a chemical NE"}
-        k=0
-        if predictions[0][i] < 2:
-            chemical_tokens_toappend = []
-            while predictions[0][i+k] < 2:
-                chemical_tokens_toappend.append(tokens[i+k])
-                k+=1
-            chemical_tokens_list.append(chemical_tokens_toappend)
-        i+=k+1
-    value = []
-    for chemical_tokens in chemical_tokens_list:
-        value.append(tokenizer.decode(tokenizer.convert_tokens_to_ids(chemical_tokens)))
-    return value
+    # Convert the predicted class ids to their label names
+    predicted_labels = [model.config.id2label[pred.item()] for pred in predictions[0]]
+
+    chemical_entities = []
+    current_entity = []
+
+    # Iterate over the tokens and their predicted labels together
+    for token, label in zip(tokenizer.convert_ids_to_tokens(tokens['input_ids'][0]), predicted_labels):
+        if label.startswith("B-"):  # Beginning of an entity
+            if current_entity:
+                chemical_entities.append(current_entity)
+                current_entity = []
+            current_entity.append(token)
+        elif label.startswith("I-") and current_entity:  # Continuation of an entity
+            current_entity.append(token)
+        else:
+            if current_entity:
+                chemical_entities.append(current_entity)
+                current_entity = []
+
+    # Flush the entity still open at the end of the sequence (the previous version had a bug here)
+    if current_entity:
+        chemical_entities.append(current_entity)
+
+    # Convert tokens back to string format
+    chemical_entities = [tokenizer.convert_tokens_to_string(entity_tokens) for entity_tokens in chemical_entities]
+
+    return chemical_entities
 
 # if text too long
 def split_text(text):