Merge pull request #189 from Inist-CNRS/services/data-computer/group-…

…by-fix fix group-by
Inist-CNRS · Oct 21, 2024 · 9a4dbc5 · 9a4dbc5
2 parents 718e472 + 85fa3de
commit 9a4dbc5
Show file tree

Hide file tree

Showing 6 changed files with 85 additions and 94 deletions.
diff --git a/package-lock.json b/package-lock.json
diff --git a/services/data-computer/README.md b/services/data-computer/README.md
@@ -1,10 +1,9 @@
-# ws-data-computer@2.16.0
+# ws-data-computer@2.16.1
 
 Le service `data-computer` offre plusieurs services **asynchrones** pour des calculs et des transformations de données simples.
 
 *Tous les services proposés acceptent uniquement en entrée des fichiers corpus standards au format tar.gz.*
 
-
 ## Utilisation
 
 - [v1/tree-segment](#v1%2ftree-segment)
@@ -21,25 +20,24 @@ par exemple avec ces données en entrée:
 
 ```json
 [
-	{ "value": ["a", "b", "c"] },
-	{ "value": ["a", "c", "d"] },
-	{ "value": ["a", "b", "d"] },
-	{ "value": ["a", "b", "c", "d"] },
-	{ "value": ["a", "c", "d", "e"] }
+  { "value": ["a", "b", "c"] },
+  { "value": ["a", "c", "d"] },
+  { "value": ["a", "b", "d"] },
+  { "value": ["a", "b", "c", "d"] },
+  { "value": ["a", "c", "d", "e"] }
 ]
-
 ```
 
 on obtiendra :
 
 ```json
 [
-	{"source":"a","target":"b","weight":3,"origin":["#1","#3","#4"]},
-	{"source":"b","target":"c","weight":2,"origin":["#1","#4"]},
-	{"source":"a","target":"c","weight":2,"origin":["#2","#5"]},
-	{"source":"c","target":"d","weight":3,"origin":["#2","#4","#5"]},
-	{"source":"b","target":"d","weight":1,"origin":["#3"]},
-	{"source":"d","target":"e","weight":1,"origin":["#5"]}
+  {"source":"a","target":"b","weight":3,"origin":["#1","#3","#4"]},
+  {"source":"b","target":"c","weight":2,"origin":["#1","#4"]},
+  {"source":"a","target":"c","weight":2,"origin":["#2","#5"]},
+  {"source":"c","target":"d","weight":3,"origin":["#2","#4","#5"]},
+  {"source":"b","target":"d","weight":1,"origin":["#3"]},
+  {"source":"d","target":"e","weight":1,"origin":["#5"]}
 ]
 ```
 
@@ -59,7 +57,6 @@ on obtiendra :
 
 #### Exemple en ligne de commande
 
-
 ```bash
 # Send data for batch processing
 cat input.tar.gz |curl --data-binary @-  -H "X-Hook: https://webhook.site/dce2fefa-9a72-4f76-96e5-059405a04f6c" "http://localhost:31976/v1/tree-segment" > output.json
@@ -78,28 +75,27 @@ par exemple avec ces données en entrée:
 
 ```json
 [
-	{ "value": ["a", "b", "c"] },
-	{ "value": ["a", "c", "d"] },
-	{ "value": ["a", "b", "d"] },
-	{ "value": ["a", "b", "c", "d"] },
-	{ "value": ["a", "c", "d", "e"] }
+  { "value": ["a", "b", "c"] },
+  { "value": ["a", "c", "d"] },
+  { "value": ["a", "b", "d"] },
+  { "value": ["a", "b", "c", "d"] },
+  { "value": ["a", "c", "d", "e"] }
 ]
-
 ```
 
 on obtiendra :
 
 ```json
 [
-	{"source":"a","target":"b","weight":3,"origin":["#1","#3","#4"]},
-	{"source":"a","target":"c","weight":4,"origin":["#1","#2","#4","#5"]},
-	{"source":"b","target":"c","weight":2,"origin":["#1","#4"]},
-	{"source":"a","target":"d","weight":4,"origin":["#2","#3","#4","#5"]},
-	{"source":"c","target":"d","weight":3,"origin":["#2","#4","#5"]},
-	{"source":"b","target":"d","weight":2,"origin":["#3","#4"]},
-	{"source":"a","target":"e","weight":1,"origin":["#5"]},
-	{"source":"c","target":"e","weight":1,"origin":["#5"]},
-	{"source":"d","target":"e","weight":1,"origin":["#5"]}
+  {"source":"a","target":"b","weight":3,"origin":["#1","#3","#4"]},
+  {"source":"a","target":"c","weight":4,"origin":["#1","#2","#4","#5"]},
+  {"source":"b","target":"c","weight":2,"origin":["#1","#4"]},
+  {"source":"a","target":"d","weight":4,"origin":["#2","#3","#4","#5"]},
+  {"source":"c","target":"d","weight":3,"origin":["#2","#4","#5"]},
+  {"source":"b","target":"d","weight":2,"origin":["#3","#4"]},
+  {"source":"a","target":"e","weight":1,"origin":["#5"]},
+  {"source":"c","target":"e","weight":1,"origin":["#5"]},
+  {"source":"d","target":"e","weight":1,"origin":["#5"]}
 ]
 ```
 
@@ -119,53 +115,47 @@ on obtiendra :
 
 #### Exemple en ligne de commande
 
-
 ```bash
 # Send data for batch processing
 cat input.tar.gz |curl --data-binary @-  -H "X-Hook: https://webhook.site/dce2fefa-9a72-4f76-96e5-059405a04f6c" "http://localhost:31976/v1/graph-segment" > output.json
 
 # When the corpus is processed, get the result
 cat output.json |curl --data-binary @- "http://localhost:31976/v1/retrieve" > output.tar.gz
-
 ```
 
-
 ### v1/lda
 
 Créer à partir de l'ensemble des documents un ensemble de topics. Chaque topic contient un champ "words", qui est composé d'une liste de 10 mots qui sont les plus caractéristiques du topic, ainsi que d'un champ "weight" qui correspond au poids associé au sujet dans le document. Le texte doit être en anglais. Les topics non exhaustifs (dont la probabilité est inférieure ou égale à 0.05) ne sont pas retournés.
 La liste des topics est affichée dans le champ "topics" et le topic avec la plus forte probabilité est retourné dans un champ "best_topic".
 
-
 Par exemple, pour un document pris dans un ensemble de documents (l'id "83" est totalement arbitraire)
 
 ```json
-
 {
 "id":"83",
 "value":"The current status and distribution of the red panda Ailurus fulgens in the wild is poorly known. The subspecies fulgens is found in the Himalaya in Nepal, India, Bhutan, northern Myanmar and southwest China, and the subspecies styani occurs further to the east in south-central China. The red panda is an animal of subtropical and temperate forests, with the exception of Meghalaya in India, where it is also found in tropical forests. In the wild, red pandas take a largely vegetarian diet consisting chiefly of bamboo. The extent of occurrence of the red panda in India is about 170,000 sq km, although its area of occupancy within this may only be about 25,000 sq km. An estimate based on the lowest recorded average density and the total area of potential habitat suggests that the global population of red pandas is about 16,000–20,000. Habitat loss and poaching, in that order, are the major threats. In this paper the distribution, status and conservation problems of the red panda, especially in India, are reviewed, and appropriate conservation measures recommended, including the protection of named areas and the extension of some existing protected areas."
 }
 ```
 
 On obtiendra :
-```json
 
+```json
 {
-"id":"83",
-"value":{
-	"topics":{
-		"topic_6":{"words":["diet","animal","high","group","level","study","blood","dietary","intake","increase"],"weight":"0.9416929"},
-		"topic_13":{"words":["diet","intake","human","b12","food","level","protein","vitamin","increase","acid"],"weight":"0.05131816"}
-		},
-	"best_topic": {
-		"topic_6":{"words":["diet","animal","high","group","level","study","blood","dietary","intake","increase"],"weight":"0.9416929"}
-	}
-}
+  "id":"83",
+  "value":{
+    "topics":{
+      "topic_6":{"words":["diet","animal","high","group","level","study","blood","dietary","intake","increase"],"weight":"0.9416929"},
+      "topic_13":{"words":["diet","intake","human","b12","food","level","protein","vitamin","increase","acid"],"weight":"0.05131816"}
+    },
+    "best_topic": {
+      "topic_6":{"words":["diet","animal","high","group","level","study","blood","dietary","intake","increase"],"weight":"0.9416929"}
+    }
+  }
 }
 ```
 
 NOTE : La qualité des résultats dépend du corpus et les topics doivent être analysés par l'utilisateur avant d'être utilisés.
 
-
 #### Paramètre(s) URL
 
 | nom                 | description                                 |
@@ -180,7 +170,6 @@ NOTE : La qualité des résultats dépend du corpus et les topics doivent être
 
 #### Exemple en ligne de commande
 
-
 ```bash
 # Send data for batch processing
 cat input.tar.gz |curl --data-binary @-  -H "X-Hook: https://webhook.site/dce2fefa-9a72-4f76-96e5-059405a04f6c" "http://localhost:31976/v1/lda" > output.json
@@ -189,52 +178,49 @@ cat input.tar.gz |curl --data-binary @-  -H "X-Hook: https://webhook.site/dce2fe
 cat output.json |curl --data-binary @- "http://localhost:31976/v1/retrieve" > output.tar.gz
 ```
 
-
 ### v1/rapido
 
 Web service à destination du projet rapido. Ce web service prend en entrée un tar.gz comportant un dossier data incluant tous les documents xml à traiter. Il renvoie un json comportant les alignements que l'algorithme a pu faire entre le texte et le référentiel idRef.
 
 Par exemple, en utilisant example-xml-rapido.tar.gz,
 On obtiendra :
-```json
-
 
+```json
 {
-	"idArticle": "bch_0007-4217_2003_num_127_2_9424",
-	"title": "Aséa",
-	"sites": [],
-	"entite": [
-		{
-			"name": "ville basse",
-			"occurences": [
-				{
-					"page": "bch_0007-4217_2003_num_127_2_T1_0778_0000",
-					"text": " papamarinopoulos ( université de patras ) ont entrepris un projet commun de prospection géophysique dans la **ville basse** d' aséa dans le but de retrouver les sections de l' enceinte recouverte par une couche d' alluvions stériles déposées par l' alphée ."
-				}
-			],
-			"notice": "https://www.idref.fr/192337963.rdf",
-			"score": "PP(0)"
-		},
-		{
-			"name": "patras",
-			"occurences": [
-				{
-					"page": "bch_0007-4217_2003_num_127_2_T1_0778_0000",
-					"text": " papamarinopoulos ( université de **patras** ) ont entrepris un projet commun de prospection géophysique dans la ville basse d' aséa dans le but de retrouver les sections de l' enceinte recouverte par une couche d' alluvions stériles déposées par l' alphée ."
-				},
-				{
-					"page": "bch_0007-4217_2003_num_127_2_T1_0778_0000",
-					"text": " les données recueillies en 2002 ont été traitées au laboratoire de géophysique du département de géologie de l' université de **patras** ."
-				}
-			],
-			"notice": "https://www-dev.idref.fr/050189484.rdf",
-			"score": "PP(0)"
-		}
-	]
+  "idArticle": "bch_0007-4217_2003_num_127_2_9424",
+  "title": "Aséa",
+  "sites": [],
+  "entite": [
+    {
+      "name": "ville basse",
+      "occurences": [
+        {
+          "page": "bch_0007-4217_2003_num_127_2_T1_0778_0000",
+          "text": " papamarinopoulos ( université de patras ) ont entrepris un projet commun de prospection géophysique dans la **ville basse** d' aséa dans le but de retrouver les sections de l' enceinte recouverte par une couche d' alluvions stériles déposées par l' alphée ."
+        }
+      ],
+      "notice": "https://www.idref.fr/192337963.rdf",
+      "score": "PP(0)"
+    },
+    {
+      "name": "patras",
+      "occurences": [
+        {
+          "page": "bch_0007-4217_2003_num_127_2_T1_0778_0000",
+          "text": " papamarinopoulos ( université de **patras** ) ont entrepris un projet commun de prospection géophysique dans la ville basse d' aséa dans le but de retrouver les sections de l' enceinte recouverte par une couche d' alluvions stériles déposées par l' alphée ."
+        },
+        {
+          "page": "bch_0007-4217_2003_num_127_2_T1_0778_0000",
+          "text": " les données recueillies en 2002 ont été traitées au laboratoire de géophysique du département de géologie de l' université de **patras** ."
+        }
+      ],
+      "notice": "https://www-dev.idref.fr/050189484.rdf",
+      "score": "PP(0)"
+    }
+  ]
 }
 ```
 
-
 #### Paramètre(s) URL
 
 | nom                 | description                                 |
@@ -249,7 +235,6 @@ On obtiendra :
 
 #### Exemple en ligne de commande
 
-
 ```bash
 # Send data for batch processing
 cat input.tar.gz |curl --data-binary @-  -H "X-Hook: https://webhook.site/dce2fefa-9a72-4f76-96e5-059405a04f6c" "http://localhost:31976/v1/rapido" > output.json
@@ -258,19 +243,19 @@ cat input.tar.gz |curl --data-binary @-  -H "X-Hook: https://webhook.site/dce2fe
 cat output.json |curl --data-binary @- "http://localhost:31976/v1/retrieve" > output.tar.gz
 ```
 
-
 ### v1/corpus-similarity
 
-Compare des petits documents (Titre, phrases, petits *abstracts*) entre eux, et renvoie pour chaque document les documents qui lui sont similaires. 
+Compare des petits documents (Titre, phrases, petits *abstracts*) entre eux, et renvoie pour chaque document les documents qui lui sont similaires.  
 Il est conseillé d'utiliser cette route avec au moins 6-7 documents dans le corpus.
 
 Il existe un paramètre optionnel `output` pour choisir le type de sortie en fonction de sa valeur:
+
 - 0 (par défaut) : l'algorithme choisit automatiquement les documents les plus similaires à chaque document
 - 1 : l'algorithme renvoie pour chaque document tous les documents, classés par ordre de proximité (les plus similaires en premier)
 - *n* (avec *n* un entier plus grand que 1) : l'algorithme renvoie pour chaque document les *n* documents les plus proches, classés par ordre de proximité (les plus similaires en premier), ainsi que le score de similarité associé à chaque document.
 Par exemple, en utilisant `example-similarity-json.tar.gz` avec le paramètre output par défaut (0), on obtiendra :
 
-> **Attention** : Le champ ID est utilisé comme référence de chaque document. 
+> **Attention** : Le champ ID est utilisé comme référence de chaque document.
 
 Par exemple, en utilisant `example-similarity-json.tar.gz` avec le paramètre output par défaut (0), on obtiendra :
 
@@ -325,7 +310,7 @@ par exemple en utilisant `example-similarity-json.tar.gz` avec le paramètre out
 ]
 ```
 
-Avec le paramètre output=3, on obtiendra : 
+Avec le paramètre output=3, on obtiendra :
 
 ```json
 [
@@ -407,7 +392,6 @@ Avec le paramètre output=3, on obtiendra :
 
 #### Exemple en ligne de commande
 
-
 ```bash
 # Send data for batch processing
 cat input.tar.gz |curl --data-binary @-  -H "X-Hook: https://webhook.site/dce2fefa-9a72-4f76-96e5-059405a04f6c" "http://localhost:31976/v1/similarity" > output.json

diff --git a/services/data-computer/package.json b/services/data-computer/package.json
@@ -1,7 +1,7 @@
 {
   "private": true,
   "name": "ws-data-computer",
-  "version": "2.16.0",
+  "version": "2.16.1",
   "description": "Calculs sur fichier corpus compressé",
   "repository": {
     "type": "git",

diff --git a/services/data-computer/swagger.json b/services/data-computer/swagger.json
@@ -3,7 +3,7 @@
     "info": {
         "title": "data-computer - Calculs sur fichier corpus compressé",
         "summary": "Calculs sur un corpus compressé",
-        "version": "2.16.0",
+        "version": "2.16.1",
         "termsOfService": "https://services.istex.fr/",
         "contact": {
             "name": "Inist-CNRS",
@@ -15,7 +15,7 @@
             "x-comment": "Will be automatically completed by the ezs server."
         },
         {
-            "url": "http://vptdmjobs.intra.inist.fr:49196/",
+            "url": "http://vptdmjobs.intra.inist.fr:49197/",
             "description": "Latest version for production",
             "x-profil": "Standard"
         }

diff --git a/services/data-computer/tests.hurl b/services/data-computer/tests.hurl
@@ -190,6 +190,6 @@ delay: 2000
 ```
 
 HTTP 200
-[{"id":"#1","value":["#1","#4"]},{"id":"#4","value":["#1","#4"]},{"id":"#2","value":["#2","#5"]},{"id":"#5","value":["#2","#5"]},{"id":"#3","value":["#3"]}]
+[{"id":"#1","value":["#4"]},{"id":"#4","value":["#1"]},{"id":"#2","value":["#5"]},{"id":"#5","value":["#2"]},{"id":"#3","value":[]}]
 
 #
diff --git a/services/data-computer/v1/group-by.ini b/services/data-computer/v1/group-by.ini
@@ -84,6 +84,13 @@ value = get('id').last()
 # Step 2.1.3 (spécifique): On agrège le tout
 [fork/delegate/aggregate]
 
+# Step 2.1.4 (spécifique): le champ id étant toujours présent dans le champ value on le supprime
+[fork/delegate/replace]
+path = id
+value = get('id')
+path = value
+value = get('value').xor([self.id])
+
 [fork/transit]
 
 # Step 2.2 (générique): Enregistrer le résultat et signaler que le traitement est fini