From bd7f83bd7f7eacc7f5e1a6be4f0393798029102f Mon Sep 17 00:00:00 2001 From: leogail Date: Fri, 22 Nov 2024 15:51:19 +0100 Subject: [PATCH 1/3] fix(chem-ner): fix how to get entities --- services/chem-ner/v1/chem/tagger.py | 47 +++++++++++++++++------------ 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/services/chem-ner/v1/chem/tagger.py b/services/chem-ner/v1/chem/tagger.py index 3cb64a2f..e615bc38 100755 --- a/services/chem-ner/v1/chem/tagger.py +++ b/services/chem-ner/v1/chem/tagger.py @@ -43,25 +43,34 @@ def predict_formula_ml(input_text): predictions = torch.argmax(output.logits, dim=-1) - # Get token that contains "CHEMICAL" - tokens = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0]) - chemical_tokens_list = [] - i=0 - - while i < len(predictions[0]): - # prediction [0][i] depends of i : {0 : "B-CHEMICAL" , 1 : "I-CHEMICAL" , 2: "NOT a chemical NE"} - k=0 - if predictions[0][i] < 2: - chemical_tokens_toappend = [] - while predictions[0][i+k] < 2: - chemical_tokens_toappend.append(tokens[i+k]) - k+=1 - chemical_tokens_list.append(chemical_tokens_toappend) - i+=k+1 - value = [] - for chemical_tokens in chemical_tokens_list: - value.append(tokenizer.decode(tokenizer.convert_tokens_to_ids(chemical_tokens))) - return value + #convert the predictions to labels + predicted_labels = [model.config.id2label[pred.item()] for pred in predictions[0]] + + chemical_entities = [] + current_entity = [] + + # Iterate over both tokens and entity directly + for token, label in zip(tokenizer.convert_ids_to_tokens(tokens['input_ids'][0]), predicted_labels): + if label.startswith("B-"): # Beginning of an entity + if current_entity: + chemical_entities.append(current_entity) + current_entity = [] + current_entity.append(token) + elif label.startswith("I-") and current_entity: # Continuation of an entity + current_entity.append(token) + else: + if current_entity: + chemical_entities.append(current_entity) + current_entity = [] + + # If there's an entity left 
at the end (the previous version had a bug here) + if current_entity: + chemical_entities.append(current_entity) + + # Convert tokens back to string format + chemical_entities = [tokenizer.convert_tokens_to_string(entity_tokens) for entity_tokens in chemical_entities] + + return chemical_entities # if text too long def split_text(text): From ed3b0538326cc519db29646e3e7cfd5de62486e9 Mon Sep 17 00:00:00 2001 From: leogail Date: Tue, 26 Nov 2024 10:11:32 +0100 Subject: [PATCH 2/3] release ws-chem-ner@4.0.1 --- services/chem-ner/README.md | 2 +- services/chem-ner/package.json | 2 +- services/chem-ner/swagger.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/chem-ner/README.md b/services/chem-ner/README.md index e7ea1116..6abf6466 100644 --- a/services/chem-ner/README.md +++ b/services/chem-ner/README.md @@ -1,4 +1,4 @@ -# ws-chem-ner@4.0.0 +# ws-chem-ner@4.0.1 Trouve des entités nommées de Chimie dans un texte diff --git a/services/chem-ner/package.json b/services/chem-ner/package.json index 060a0290..53b53d89 100644 --- a/services/chem-ner/package.json +++ b/services/chem-ner/package.json @@ -1,7 +1,7 @@ { "private": true, "name": "ws-chem-ner", - "version": "4.0.0", + "version": "4.0.1", "description": "Trouve des entités nommées de Chimie dans un texte", "repository": { "type": "git", diff --git a/services/chem-ner/swagger.json b/services/chem-ner/swagger.json index a1f41733..378fd1f2 100644 --- a/services/chem-ner/swagger.json +++ b/services/chem-ner/swagger.json @@ -3,7 +3,7 @@ "info": { "title": "chem-ner - Trouve des entités nommées de Chimie dans un texte", "summary": "Renvoie un Json composé comportant d'un champ \"chemical\" et d'un champ \"chemical_disambiguisate\"", - "version": "4.0.0", + "version": "4.0.1", "termsOfService": "https://services.istex.fr/", "contact": { "name": "Inist-CNRS", From fb5eed9971997d3d85a1537782219c0a755b7085 Mon Sep 17 00:00:00 2001 From: leogail Date: Tue, 26 Nov 2024 10:23:35 +0100 Subject: [PATCH 
3/3] chore(chem-ner): update port --- services/chem-ner/swagger.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/chem-ner/swagger.json b/services/chem-ner/swagger.json index 378fd1f2..5e4c2ab0 100644 --- a/services/chem-ner/swagger.json +++ b/services/chem-ner/swagger.json @@ -15,7 +15,7 @@ "x-comment": "Will be automatically completed by the ezs server." }, { - "url": "http://vptdmservices.intra.inist.fr:49306/", + "url": "http://vptdmservices.intra.inist.fr:49323/", "description": "Latest version for production", "x-profil": "Standard" }