Skip to content

Commit

Permalink
Merge pull request #211 from Inist-CNRS/services/chem-ner/fix-bug
Browse files Browse the repository at this point in the history
fix(chem-ner): fix how to get entities
  • Loading branch information
parmentf authored Nov 26, 2024
2 parents 91b36d3 + fb5eed9 commit 1179f77
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 23 deletions.
2 changes: 1 addition & 1 deletion services/chem-ner/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# ws-chem-ner@4.0.0
# ws-chem-ner@4.0.1

Trouve des entités nommées de Chimie dans un texte

Expand Down
2 changes: 1 addition & 1 deletion services/chem-ner/package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"private": true,
"name": "ws-chem-ner",
"version": "4.0.0",
"version": "4.0.1",
"description": "Trouve des entités nommées de Chimie dans un texte",
"repository": {
"type": "git",
Expand Down
4 changes: 2 additions & 2 deletions services/chem-ner/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"info": {
"title": "chem-ner - Trouve des entités nommées de Chimie dans un texte",
"summary": "Renvoie un Json composé comportant d'un champ \"chemical\" et d'un champ \"chemical_disambiguisate\"",
"version": "4.0.0",
"version": "4.0.1",
"termsOfService": "https://services.istex.fr/",
"contact": {
"name": "Inist-CNRS",
Expand All @@ -15,7 +15,7 @@
"x-comment": "Will be automatically completed by the ezs server."
},
{
"url": "http://vptdmservices.intra.inist.fr:49306/",
"url": "http://vptdmservices.intra.inist.fr:49323/",
"description": "Latest version for production",
"x-profil": "Standard"
}
Expand Down
47 changes: 28 additions & 19 deletions services/chem-ner/v1/chem/tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,25 +43,34 @@ def predict_formula_ml(input_text):

predictions = torch.argmax(output.logits, dim=-1)

# Get token that contains "CHEMICAL"
tokens = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
chemical_tokens_list = []
i=0

while i < len(predictions[0]):
# predictions[0][i] is the label id predicted for token i: {0: "B-CHEMICAL", 1: "I-CHEMICAL", 2: not a chemical named entity}
k=0
if predictions[0][i] < 2:
chemical_tokens_toappend = []
while predictions[0][i+k] < 2:
chemical_tokens_toappend.append(tokens[i+k])
k+=1
chemical_tokens_list.append(chemical_tokens_toappend)
i+=k+1
value = []
for chemical_tokens in chemical_tokens_list:
value.append(tokenizer.decode(tokenizer.convert_tokens_to_ids(chemical_tokens)))
return value
# Convert the predicted label ids to their label names
predicted_labels = [model.config.id2label[pred.item()] for pred in predictions[0]]

chemical_entities = []
current_entity = []

# Iterate over the tokens and their predicted labels in parallel
for token, label in zip(tokenizer.convert_ids_to_tokens(tokens['input_ids'][0]), predicted_labels):
if label.startswith("B-"): # Beginning of an entity
if current_entity:
chemical_entities.append(current_entity)
current_entity = []
current_entity.append(token)
elif label.startswith("I-") and current_entity: # Continuation of an entity
current_entity.append(token)
else:
if current_entity:
chemical_entities.append(current_entity)
current_entity = []

# Flush the entity still open when the sequence ends (the previous version had a bug in this case)
if current_entity:
chemical_entities.append(current_entity)

# Convert tokens back to string format
chemical_entities = [tokenizer.convert_tokens_to_string(entity_tokens) for entity_tokens in chemical_entities]

return chemical_entities

# if text too long
def split_text(text):
Expand Down

0 comments on commit 1179f77

Please sign in to comment.