diff --git a/package-lock.json b/package-lock.json index 5fa6eada..34a19ea0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -4481,7 +4481,7 @@ }, "services/data-computer": { "name": "ws-data-computer", - "version": "2.15.0", + "version": "2.16.0", "license": "MIT" }, "services/data-termsuite": { @@ -4490,6 +4490,7 @@ "license": "MIT" }, "services/data-topcitation": { + "name": "ws-data-topcitation", "version": "1.0.2", "license": "MIT" }, diff --git a/services/data-computer/README.md b/services/data-computer/README.md index 78596844..eb822168 100644 --- a/services/data-computer/README.md +++ b/services/data-computer/README.md @@ -1,4 +1,4 @@ -# ws-data-computer@2.15.0 +# ws-data-computer@2.16.0 Le service `data-computer` offre plusieurs services **asynchrones** pour des calculs et de transformations de données simples. @@ -256,3 +256,161 @@ cat input.tar.gz |curl --data-binary @- -H "X-Hook: https://webhook.site/dce2fe # When the corpus is processed, get the result cat output.json |curl --data-binary @- "http://localhost:31976/v1/retrieve" > output.tar.gz +``` + + +### v1/corpus-similarity + +Compare des petits documents (Titre, phrases, petits *abstracts*) entre eux, et renvoie pour chaque document les documents qui lui sont similaires. +Il est conseillé d'utiliser cette route avec au moins 6-7 documents dans le corpus. + +Il existe un paramètre optionnel `output` pour choisir le type de sortie en fonction de sa valeur: +- 0 (par défaut) : l'algorithme choisit automatiquement les documents les plus similaires à chaque document +- 1 : l'algorithme renvoie pour chaque document tous les documents, classés par ordre de proximité (les plus similaires en premier) +- *n* (avec *n* un entier plus grand que 1) : l'algorithme renvoie pour chaque document les *n* documents les plus proches, classés par ordre de proximité (les plus similaires en premier), ainsi que le score de similarité associé à chaque document. 
+par exemple en utilisant `example-similarity-json.tar.gz` avec le paramètre output par défaut (0), obtiendra : + +> **Attention** : Le champ ID est utilisé comme référence de chaque document. + +par exemple en utilisant `example-similarity-json.tar.gz` avec le paramètre output par défaut (0), obtiendra : + +```json +[ + { + "id": "Titre 1", + "value": { + "similarity": [ + "Titre 4", + "Titre 2" + ], + "score": [ + 0.9411764705882353, + 0.9349112426035503 + ] + } + }, + { + "id": "Titre 2", + "value": { + "similarity": [ + "Titre 1" + ], + "score": [ + 0.9349112426035503 + ] + } + }, + { + "id": "Titre 3", + "value": { + "similarity": [ + "Titre 4" + ], + "score": [ + 0.8888888888888888 + ] + } + }, + { + "id": "Titre 4", + "value": { + "similarity": [ + "Titre 1" + ], + "score": [ + 0.9411764705882353 + ] + } + } +] +``` + +Avec le paramètre output=3, on obtiendra : + +```json +[ + { + "id": "Titre 1", + "value": { + "similarity": [ + "Titre 4", + "Titre 2", + "Titre 3" + ], + "score": [ + 0.9411764705882353, + 0.9349112426035503, + 0.8757396449704142 + ] + } + }, + { + "id": "Titre 2", + "value": { + "similarity": [ + "Titre 1", + "Titre 4", + "Titre 3" + ], + "score": [ + 0.9349112426035503, + 0.8888888888888888, + 0.8651685393258427 + ] + } + }, + { + "id": "Titre 3", + "value": { + "similarity": [ + "Titre 4", + "Titre 1", + "Titre 2" + ], + "score": [ + 0.8888888888888888, + 0.8757396449704142, + 0.8651685393258427 + ] + } + }, + { + "id": "Titre 4", + "value": { + "similarity": [ + "Titre 1", + "Titre 3", + "Titre 2" + ], + "score": [ + 0.9411764705882353, + 0.8888888888888888, + 0.8888888888888888 + ] + } + } +] +``` + +#### Paramètre(s) URL + +| nom | description | +| ------------------- | ------------------------------------------- | +| indent (true/false) | Indenter le résultat renvoyer immédiatement | +| output (0,1,n) | Choix de la sortie | + +#### Entête(s) HTTP + +| nom | description | +| ------ | 
------------------------------------------------------------ |
+| X-Hook | URL à appeler quand le résultat sera disponible (facultatif) |
+
+#### Exemple en ligne de commande
+
+
+```bash
+# Send data for batch processing
+cat input.tar.gz |curl --data-binary @- -H "X-Hook: https://webhook.site/dce2fefa-9a72-4f76-96e5-059405a04f6c" "http://localhost:31976/v1/corpus-similarity" > output.json
+
+# When the corpus is processed, get the result
+cat output.json |curl --data-binary @- "http://localhost:31976/v1/retrieve" > output.tar.gz
diff --git a/services/data-computer/example-similarity-json.tar.gz b/services/data-computer/example-similarity-json.tar.gz
new file mode 100644
index 00000000..5316bb15
Binary files /dev/null and b/services/data-computer/example-similarity-json.tar.gz differ
diff --git a/services/data-computer/examples.http b/services/data-computer/examples.http
index 27886c8b..d116eef2 100644
--- a/services/data-computer/examples.http
+++ b/services/data-computer/examples.http
@@ -123,3 +123,14 @@ X-Webhook-Success: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9
X-Webhook-Failure: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9

< ./example-json.tar.gz
+
+
+###
+# @name v1CorpusSimilarity
+POST {{host}}/v1/corpus-similarity HTTP/1.1
+Content-Type: application/x-tar
+X-Webhook-Success: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9
+X-Webhook-Failure: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9
+
+< ./example-similarity-json.tar.gz
+
diff --git a/services/data-computer/package.json b/services/data-computer/package.json
index 65c124f5..ac65a822 100644
--- a/services/data-computer/package.json
+++ b/services/data-computer/package.json
@@ -1,35 +1,35 @@
 {
-    "private": true,
-    "name": "ws-data-computer",
-    "version": "2.15.0",
-    "description": "Calculs sur fichier corpus compressé",
-    "repository": {
-        "type": "git",
-        "url": "git+https://github.com/Inist-CNRS/web-services.git"
-    },
-    "keywords": [
-        "ezmaster"
-    ],
-    
"author": " ", - "license": "MIT", - "bugs": { - "url": "https://github.com/Inist-CNRS/web-services/issues" - }, - "homepage": "https://github.com/Inist-CNRS/web-services/#readme", - "scripts": { - "version:insert:readme": "sed -i \"s#\\(${npm_package_name}.\\)\\([\\.a-z0-9]\\+\\)#\\1${npm_package_version}#g\" README.md && git add README.md", - "version:insert:swagger": "sed -i \"s/\\\"version\\\": \\\"[0-9]\\+.[0-9]\\+.[0-9]\\+\\\"/\\\"version\\\": \\\"${npm_package_version}\\\"/g\" swagger.json && git add swagger.json", - "version:insert": "npm run version:insert:readme && npm run version:insert:swagger", - "version:commit": "git commit -a -m \"release ${npm_package_name}@${npm_package_version}\"", - "version:tag": "git tag \"${npm_package_name}@${npm_package_version}\" -m \"${npm_package_name}@${npm_package_version}\"", - "version:push": "git push && git push --tags", - "version": "npm run version:insert && npm run version:commit && npm run version:tag", - "postversion": "npm run version:push", - "build:dev": "docker build -t cnrsinist/${npm_package_name}:latest .", - "start:dev": "npm run build:dev && docker run --name dev --rm --detach -p 31976:31976 cnrsinist/${npm_package_name}:latest", - "stop:dev": "docker stop dev", - "build": "docker build -t cnrsinist/${npm_package_name}:${npm_package_version} .", - "start": "docker run --rm -p 31976:31976 cnrsinist/${npm_package_name}:${npm_package_version}", - "publish": "docker push cnrsinist/${npm_package_name}:${npm_package_version}" - } + "private": true, + "name": "ws-data-computer", + "version": "2.16.0", + "description": "Calculs sur fichier corpus compressé", + "repository": { + "type": "git", + "url": "git+https://github.com/Inist-CNRS/web-services.git" + }, + "keywords": [ + "ezmaster" + ], + "author": " ", + "license": "MIT", + "bugs": { + "url": "https://github.com/Inist-CNRS/web-services/issues" + }, + "homepage": "https://github.com/Inist-CNRS/web-services/#readme", + "scripts": { + 
"version:insert:readme": "sed -i \"s#\\(${npm_package_name}.\\)\\([\\.a-z0-9]\\+\\)#\\1${npm_package_version}#g\" README.md && git add README.md", + "version:insert:swagger": "sed -i \"s/\\\"version\\\": \\\"[0-9]\\+.[0-9]\\+.[0-9]\\+\\\"/\\\"version\\\": \\\"${npm_package_version}\\\"/g\" swagger.json && git add swagger.json", + "version:insert": "npm run version:insert:readme && npm run version:insert:swagger", + "version:commit": "git commit -a -m \"release ${npm_package_name}@${npm_package_version}\"", + "version:tag": "git tag \"${npm_package_name}@${npm_package_version}\" -m \"${npm_package_name}@${npm_package_version}\"", + "version:push": "git push && git push --tags", + "version": "npm run version:insert && npm run version:commit && npm run version:tag", + "postversion": "npm run version:push", + "build:dev": "docker build -t cnrsinist/${npm_package_name}:latest .", + "start:dev": "npm run build:dev && docker run --name dev --rm --detach -p 31976:31976 cnrsinist/${npm_package_name}:latest", + "stop:dev": "docker stop dev", + "build": "docker build -t cnrsinist/${npm_package_name}:${npm_package_version} .", + "start": "docker run --rm -p 31976:31976 cnrsinist/${npm_package_name}:${npm_package_version}", + "publish": "docker push cnrsinist/${npm_package_name}:${npm_package_version}" + } } diff --git a/services/data-computer/swagger.json b/services/data-computer/swagger.json index 910be1e6..f5962c80 100644 --- a/services/data-computer/swagger.json +++ b/services/data-computer/swagger.json @@ -3,7 +3,7 @@ "info": { "title": "data-computer - Calculs sur fichier corpus compressé", "summary": "Calculs sur un corpus compressé", - "version": "2.15.0", + "version": "2.16.0", "termsOfService": "https://services.istex.fr/", "contact": { "name": "Inist-CNRS", @@ -15,7 +15,7 @@ "x-comment": "Will be automatically completed by the ezs server." 
}, { - "url": "http://vptdmjobs.intra.inist.fr:49191/", + "url": "http://vptdmjobs.intra.inist.fr:49196/", "description": "Latest version for production", "x-profil": "Standard" } @@ -30,4 +30,4 @@ } } ] -} +} \ No newline at end of file diff --git a/services/data-computer/tests.hurl b/services/data-computer/tests.hurl index 56aa93e4..29416a7b 100644 --- a/services/data-computer/tests.hurl +++ b/services/data-computer/tests.hurl @@ -72,8 +72,92 @@ delay: 2000 HTTP 200 [{"id":"#1","value":{"sample":2,"frequency":0.6666666666666666,"percentage":null,"sum":0,"count":5,"min":0,"max":0,"mean":0,"range":0,"midrange":0,"variance":0,"deviation":0,"population":3,"input":"a"}},{"id":"#2","value":{"sample":2,"frequency":0.6666666666666666,"percentage":null,"sum":0,"count":5,"min":0,"max":0,"mean":0,"range":0,"midrange":0,"variance":0,"deviation":0,"population":3,"input":"b"}},{"id":"#3","value":{"sample":1,"frequency":0.3333333333333333,"percentage":null,"sum":0,"count":5,"min":0,"max":0,"mean":0,"range":0,"midrange":0,"variance":0,"deviation":0,"population":3,"input":"c"}},{"id":"#4","value":{"sample":2,"frequency":0.6666666666666666,"percentage":null,"sum":0,"count":5,"min":0,"max":0,"mean":0,"range":0,"midrange":0,"variance":0,"deviation":0,"population":3,"input":"a"}},{"id":"#5","value":{"sample":2,"frequency":0.6666666666666666,"percentage":null,"sum":0,"count":5,"min":0,"max":0,"mean":0,"range":0,"midrange":0,"variance":0,"deviation":0,"population":3,"input":"b"}}] -# -# group +################################ Test for Similarity ################################ + +POST {{host}}/v1/corpus-similarity +content-type: application/x-tar +x-hook: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9 +file,example-similarity-json.tar.gz; + +HTTP 200 +# Capture the computing token +[Captures] +computing_token: jsonpath "$[0].value" +[Asserts] +variable "computing_token" exists + +# There should be a waiting time, representing the time taken to process data. 
+# Fortunately, as the data is sparse, and the computing time is small, +# the need is small. + +# Version 4.1.0 of hurl added a delay option, which value is milliseconds. +# https://hurl.dev/blog/2023/09/24/announcing-hurl-4.1.0.html#add-delay-between-requests + +POST {{host}}/v1/retrieve-json?indent=true +content-type: application/json +[Options] +delay: 1000 +``` +[ + { + "value":"{{computing_token}}" + } +] +``` + +HTTP 200 +[{ + "id": "Titre 1", + "value": { + "similarity": [ + "Titre 4", + "Titre 2" + ], + "score": [ + 0.9411764705882353, + 0.9349112426035503 + ] + } +}, +{ + "id": "Titre 2", + "value": { + "similarity": [ + "Titre 1" + ], + "score": [ + 0.9349112426035503 + ] + } +}, +{ + "id": "Titre 3", + "value": { + "similarity": [ + "Titre 4" + ], + "score": [ + 0.8888888888888888 + ] + } +}, +{ + "id": "Titre 4", + "value": { + "similarity": [ + "Titre 1" + ], + "score": [ + 0.9411764705882353 + ] + } +}] + + +# TODO: ajouter les deux autres routes (v1GraphSegment, v1Lda) +# TODO: ajouter la route rapido + +##################################### group-by ###################### POST {{host}}/v1/group-by content-type: application/gzip x-hook: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9 @@ -109,4 +193,3 @@ HTTP 200 [{"id":"#1","value":["#1","#4"]},{"id":"#4","value":["#1","#4"]},{"id":"#2","value":["#2","#5"]},{"id":"#5","value":["#2","#5"]},{"id":"#3","value":["#3"]}] # -# TODO: ajouter les deux autres routes (v1GraphSegment, v1Lda) diff --git a/services/data-computer/v1/corpus-similarity.ini b/services/data-computer/v1/corpus-similarity.ini new file mode 100644 index 00000000..c50fceeb --- /dev/null +++ b/services/data-computer/v1/corpus-similarity.ini @@ -0,0 +1,65 @@ +# OpenAPI Documentation - JSON format (dot notation) +mimeType = application/json + +post.operationId = post-v1-corpus-similarity +post.description = Web service de calcul de similarité entre documents d un corpus +post.summary = 3 sorties sont disponibles +post.tags.0 = 
data-computer
+post.requestBody.content.application/x-tar.schema.type = string
+post.requestBody.content.application/x-tar.schema.format = binary
+post.requestBody.required = true
+post.responses.default.description = Informations permettant de récupérer les données le moment venu
+post.parameters.0.description = Indenter le JSON résultant
+post.parameters.0.in = query
+post.parameters.0.name = indent
+post.parameters.0.schema.type = boolean
+post.parameters.1.description = URL pour signaler que le traitement est terminé
+post.parameters.1.in = header
+post.parameters.1.name = X-Webhook-Success
+post.parameters.1.schema.type = string
+post.parameters.1.schema.format = uri
+post.parameters.1.required = false
+post.parameters.2.description = URL pour signaler que le traitement a échoué
+post.parameters.2.in = header
+post.parameters.2.name = X-Webhook-Failure
+post.parameters.2.schema.type = string
+post.parameters.2.schema.format = uri
+post.parameters.2.required = false
+
+post.parameters.3.in = query
+post.parameters.3.name = output
+post.parameters.3.schema.type = integer
+post.parameters.3.description = Choix du nombre de documents similaires à afficher dans la sortie : 0 pour automatique, 1 pour tout afficher, n'importe quel autre nombre pour afficher au maximum ce nombre d'éléments.
+
+
+[env]
+path = generator
+value = corpus-similarity
+
+[use]
+plugin = basics
+plugin = spawn
+
+# Step 1 (générique): Charger le fichier corpus
+[delegate]
+file = charger.cfg
+
+# Step 2 (générique): Traiter de manière asynchrone les items reçus
+[fork]
+standalone = true
+logger = logger.cfg
+
+# Step 2.1 (spécifique): Lancer un calcul sur tous les items reçus
+[fork/exec]
+# command should be executable !
+command = ./v1/corpus-similarity.py +args = fix('-p') +args = env('output', "0") + +# Step 2.2 (générique): Enregistrer le résultat et signaler que le traitement est fini +[fork/delegate] +file = recorder.cfg + +# Step 3 : Renvoyer immédiatement un seul élément indiquant comment récupérer le résultat quand il sera prêt +[delegate] +file = recipient.cfg diff --git a/services/data-computer/v1/corpus-similarity.py b/services/data-computer/v1/corpus-similarity.py new file mode 100755 index 00000000..c5dc00cd --- /dev/null +++ b/services/data-computer/v1/corpus-similarity.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import json +import sys +from difflib import SequenceMatcher +import numpy as np + +def get_ratio(data): + currentTitle = data['value'] + currentId = data['id'] + idList = [] + ratioList = [] + + for _,line_cmp in enumerate(all_data): + data_cmp = line_cmp[0] + id,title = data_cmp["id"],data_cmp["value"] + if currentId == id: + continue + ratio = SequenceMatcher(None, currentTitle, title).ratio() + idList.append(id) + ratioList.append(ratio) + + #Sort both lists according to ratioList + ratioList,idList = (list(t) for t in zip(*sorted(zip(ratioList, idList),reverse=True))) + + return currentId, ratioList,idList + +# load all datas +all_data = [] +for line in sys.stdin: + data=json.loads(line) + all_data.append(data) + + +output = int(sys.argv[sys.argv.index('-p') + 1] if '-p' in sys.argv else 0) + +for line in all_data: + id, ratioList, idList = get_ratio(line[0]) + if output == 0: + if ratioList[0] < 0.6: + sim = [] + score = [] + else: + diff = -np.diff(ratioList) + mean = np.mean(diff) + argmx = np.argmax(diff-mean) + sim = idList[:argmx+1] + score = ratioList[:argmx+1] + elif output == 1: + sim = idList + score = ratioList + else: + sim = idList[:min(len(idList),output)] + score = ratioList[:min(len(idList),output)] + + sys.stdout.write(json.dumps({"id":id,"value":{"similarity":sim, "score":score}})) + sys.stdout.write('\n')