diff --git a/.dockerignore b/.dockerignore
index 0a3bc654..b9084352 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -7,7 +7,6 @@
.venv
.vscode
data
-init-mongo.js
# files and directories in the whole project
**/.gitignore
diff --git a/.env b/.env
index cff01ddb..9358193e 100644
--- a/.env
+++ b/.env
@@ -12,9 +12,6 @@ LOAD_BALANCER_CONFIG_FILE_PATH=src/node_config.docker.yml
# shared worker variables
AUTHORIZATION_SECRET=YWVuaXF1YWRpNWNlaXJpNmFlbTZkb283dXphaVF1b29oM3J1MWNoYWlyNHRoZWUzb2huZ2FpM211bGVlM0VpcAo=
-DATABASE_HOST=database
-DATABASE_PORT=27017
-DATABASE_NAME=athene_db
BALANCER_QUEUE_FREQUENCY=600
BALANCER_GETTASK_URL=http://load-balancer:8000/getTask
BALANCER_SENDRESULT_URL=http://load-balancer:8000/sendTaskResult
@@ -23,18 +20,4 @@ BALANCER_SENDRESULT_URL=http://load-balancer:8000/sendTaskResult
# embedding variables
EMBEDDING_CLOUD_CONFIG_PATH=./embedding/src/cloud/config.py
-EMBEDDING_DATABASE_USER=embedding
-EMBEDDING_DATABASE_PWD=embedding_password
EMBEDDING_CHUNK_SIZE=50
-
-# clustering variables
-CLUSTERING_DATABASE_USER=embedding
-CLUSTERING_DATABASE_PWD=embedding_password
-
-# tracking variables
-TRACKING_DATABASE_USER=tracking
-TRACKING_DATABASE_PWD=tracking_password
-
-# database variables
-DATABASE_ROOT_USERNAME=root
-DATABASE_ROOT_PASSWORD=root_password
diff --git a/.github/workflows/dockerimage.yml b/.github/workflows/dockerimage.yml
index d2adcd43..6c10efd3 100644
--- a/.github/workflows/dockerimage.yml
+++ b/.github/workflows/dockerimage.yml
@@ -91,22 +91,3 @@ jobs:
run: ./.github/workflows/scripts/dockerimage.sh "clustering"
- name: Run unittests for clustering-component
run: docker run -i --rm --entrypoint python ghcr.io/ls1intum/athena/clustering:${GITHUB_REF##*/} -m unittest discover -p test_*.py
-
- athene-tracking:
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v2
- - name: Login to GitHub Container Registry
- uses: docker/login-action@v2
- with:
- registry: ghcr.io
- username: ${GITHUB_ACTOR}
- password: ${{ secrets.GITHUB_TOKEN }}
- - name: Login to Docker Hub
- uses: docker/login-action@v2
- with:
- username: ${{ secrets.DOCKERHUB_USER }}
- password: ${{ secrets.DOCKERHUB_TOKEN }}
- if: github.ref == 'refs/heads/master'
- - name: Build and Push the athene-tracking Docker image
- run: ./.github/workflows/scripts/dockerimage.sh "tracking"
diff --git a/.github/workflows/scripts/dockerimage.sh b/.github/workflows/scripts/dockerimage.sh
index 227473cc..8f5d4f57 100755
--- a/.github/workflows/scripts/dockerimage.sh
+++ b/.github/workflows/scripts/dockerimage.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-COMPONENT=$1 # Parameter $1 (Component): either "load-balancer", "segmentation", "embedding", "clustering" or "tracking"
+COMPONENT=$1 # Parameter $1 (Component): either "load-balancer", "segmentation", "embedding" or "clustering"
echo -e "INFO: Building ${COMPONENT}-component"
diff --git a/.local.env b/.local.env
index b2004ab2..155dae88 100644
--- a/.local.env
+++ b/.local.env
@@ -11,9 +11,6 @@ LOAD_BALANCER_CONFIG_FILE_PATH=src/node_config.local.yml
# shared worker variables
AUTHORIZATION_SECRET=YWVuaXF1YWRpNWNlaXJpNmFlbTZkb283dXphaVF1b29oM3J1MWNoYWlyNHRoZWUzb2huZ2FpM211bGVlM0VpcAo=
-DATABASE_HOST=database
-DATABASE_PORT=27017
-DATABASE_NAME=athene_db
BALANCER_QUEUE_FREQUENCY=600
BALANCER_GETTASK_URL=http://localhost:8000/getTask
BALANCER_SENDRESULT_URL=http://localhost:8000/sendTaskResult
@@ -21,18 +18,4 @@ BALANCER_SENDRESULT_URL=http://localhost:8000/sendTaskResult
# segmentation variables
# embedding variables
-EMBEDDING_DATABASE_USER=embedding
-EMBEDDING_DATABASE_PWD=embedding_password
EMBEDDING_CHUNK_SIZE=50
-
-# clustering variables
-CLUSTERING_DATABASE_USER=embedding
-CLUSTERING_DATABASE_PWD=embedding_password
-
-# tracking variables
-TRACKING_DATABASE_USER=tracking
-TRACKING_DATABASE_PWD=tracking_password
-
-# database variables
-DATABASE_ROOT_USERNAME=root
-DATABASE_ROOT_PASSWORD=root_password
diff --git a/.run/All Services.run.xml b/.run/All Services.run.xml
index b068262e..c30f9176 100644
--- a/.run/All Services.run.xml
+++ b/.run/All Services.run.xml
@@ -5,7 +5,6 @@
-
\ No newline at end of file
diff --git a/.run/Traefik and DB.run.xml b/.run/Traefik.run.xml
similarity index 64%
rename from .run/Traefik and DB.run.xml
rename to .run/Traefik.run.xml
index 2bc70111..03ceb21a 100644
--- a/.run/Traefik and DB.run.xml
+++ b/.run/Traefik.run.xml
@@ -1,9 +1,9 @@
-
+
-
+
diff --git a/.run/tracking.run.xml b/.run/tracking.run.xml
deleted file mode 100644
index fd0aa6d2..00000000
--- a/.run/tracking.run.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 9f7241e7..388945f4 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ export
all: | setup start
-setup: .venv setup-protobuf setup-clustering setup-embedding setup-load-balancer setup-segmentation setup-tracking
+setup: .venv setup-protobuf setup-clustering setup-embedding setup-load-balancer setup-segmentation
.venv:
python -m venv .venv
@@ -29,10 +29,6 @@ setup-segmentation:
$(info Building segmentation)
cd segmentation && $(MAKE)
-setup-tracking:
- $(info Building tracking)
- cd tracking && $(MAKE)
-
start-clustering: setup-clustering
$(info Starting clustering)
$(MAKE) -C clustering start
@@ -49,17 +45,13 @@ start-segmentation: setup-segmentation
$(info Starting segmentation)
$(MAKE) -C segmentation start
-start-tracking: setup-tracking
- $(info Starting tracking)
- $(MAKE) -C tracking start
-
-start-traefik-db:
- $(info Starting traefik and db)
- docker-compose -f docker-compose-traefik-db.yml up
+start-traefik:
+ $(info Starting traefik)
+ docker-compose -f docker-compose-traefik.yml up
start:
$(info Starting all services)
- $(MAKE) -j6 start-clustering start-embedding start-load-balancer start-segmentation start-tracking start-traefik-db
+ $(MAKE) -j6 start-clustering start-embedding start-load-balancer start-segmentation start-traefik
clean:
rm -rf .venv
@@ -68,6 +60,5 @@ clean:
cd embedding && $(MAKE) clean
cd load-balancer && $(MAKE) clean
cd segmentation && $(MAKE) clean
- cd tracking && $(MAKE) clean
-.PHONY: all setup setup-protobuf setup-clustering setup-embedding setup-load-balancer setup-segmentation setup-tracking start-clustering start-embedding start-load-balancer start-segmentation start-tracking start-traefik-db start clean
\ No newline at end of file
+.PHONY: all setup setup-protobuf setup-clustering setup-embedding setup-load-balancer setup-segmentation start-clustering start-embedding start-load-balancer start-segmentation start-traefik start clean
\ No newline at end of file
diff --git a/README.md b/README.md
index 34906b07..45585e33 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ which initializes virtual environments,
installs dependencies and downloads required models.
After that the services will be started automatically.
-There is one special target in the `Makefile` that will start traefik and the MongoDB database in a docker container
+There is one special target in the `Makefile` that will start traefik in a docker container
to redirect to the services running on your local machine.
You can always just directly use `make` and it will automatically detect changed dependencies.
@@ -87,7 +87,6 @@ If you are using PyCharm, you can configure the project as follows:
\- `embedding`
\- `load-balancer`
\- `segmentation`
- \- `tracking`
5. Configure the virtual environment Python interpreters for the different modules: For each of the modules in the list above, go to `File -> Settings -> Project: Athena -> Project Interpreter` and select the virtual environment in the `.venv` directory of the respective module.
@@ -115,8 +114,6 @@ The following API-routes are available after start:
* - For the computation components to query tasks from the load balancer
* - For the computation components to send back results to the load balancer
* - For Artemis to upload course material
-* - For Artemis to access tracking functionality
-* - For Artemis to access feedback\_consistency functionality
Traefik provides a dashboard to monitor the status of underlying components.
This dashboard is available on by default.
diff --git a/benchmark/__init__.py b/benchmark/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt
deleted file mode 100644
index 495a63a9..00000000
--- a/benchmark/requirements.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-enum34==1.1.6
-ipaddr==2.2.0
-ipaddress==1.0.17
-requests==2.31.0
-urllib3==1.26.18
-virtualenv==15.1.0
-websocket-client==0.44.0
-
-pyocclient==0.4
-
-nltk==3.6.6
diff --git a/benchmark/src/__init__.py b/benchmark/src/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/src/data/__init__.py b/benchmark/src/data/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/src/data/data_retriever.py b/benchmark/src/data/data_retriever.py
deleted file mode 100644
index 34c525f5..00000000
--- a/benchmark/src/data/data_retriever.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import pandas as pd
-from pandas import read_csv
-from pathlib import Path
-from benchmark.src.entities.text_block import TextBlock
-from benchmark.src.entities.feedback_with_text_block import FeedbackWithTextBlock
-import itertools
-
-__cwd = Path.cwd()
-PATH_LABELED_SUBMISSIONS = (__cwd / "data/resources/text_block.csv").resolve()
-PATH_TEXT_BLOCKS = (__cwd / "data/resources/ArTEMiS_text_block.csv").resolve()
-PATH_FEEDBACK = (__cwd / "data/resources/ArTEMiS_feedback.csv").resolve()
-PATH_FEEDBACK_CONSISTENCY = (__cwd / "data/resources/feedback.csv").resolve()
-PATH_FEEDBACK_CONSISTENCY_OUTPUT = (__cwd / "data/resources/feedback_inconsistencies.csv").resolve()
-
-
-def read_labeled_sentences_from_csv(num_sentences=None):
- submissions = read_csv(PATH_LABELED_SUBMISSIONS)
- submissions = submissions[~submissions["manual_cluster_id"].isnull()]
- sentences = submissions[["text"]].values.flatten()
- ground_truth_clusters = submissions[["manual_cluster_id"]].values.flatten()
- ids = submissions[["id"]].values.flatten()
- if num_sentences is None:
- num_sentences = len(sentences)
- else:
- num_sentences = min(num_sentences, len(sentences))
- return [TextBlock(sentences[i], ground_truth_cluster=ground_truth_clusters[i], id=ids[i]) for i in
- range(num_sentences)]
-
-
-def read_sentences_feedback_from_csv(num_sentences=None):
- text_blocks_csv = read_csv(PATH_TEXT_BLOCKS)
- feedback_csv = read_csv(PATH_FEEDBACK)
- result = pd.merge(text_blocks_csv, feedback_csv, left_on="id", right_on="reference")
- result = result[~result["points"].isnull()]
- result = result[~result["text"].isnull()]
- ids = result[["id"]].values.flatten()
- text_blocks = result[["text"]].values.flatten()
- points = result[["points"]].values.flatten()
- if num_sentences is None:
- num_sentences = len(text_blocks)
- else:
- num_sentences = min(num_sentences, len(text_blocks))
- return [TextBlock(text_blocks[i], ground_truth_grade=points[i], id=ids[i]) for i in range(num_sentences)]
-
-
-def read_feedback_consistency_from_csv():
- data = read_csv(PATH_FEEDBACK_CONSISTENCY, sep=";", keep_default_na=False)
- feedback_ids = data[["feedback_id"]].values.flatten()
- feedback_texts = data[["feedback_text"]].values.flatten()
- feedback_scores = data[["score"]].values.flatten()
- references = data[["reference"]].values.flatten()
- ids = data[["textblock_id"]].values.flatten()
- texts = data[["textblock_text"]].values.flatten()
- submission_ids = data[["submission_id"]].values.flatten()
- cluster_ids = data[["cluster_id"]].values.flatten()
- blocks = [FeedbackWithTextBlock(textblock_id=ids[i], submission_id=submission_ids[i], cluster_id=cluster_ids[i],
- text=texts[i], feedback_id=feedback_ids[i], feedback_score=feedback_scores[i],
- feedback_text=feedback_texts[i], reference=references[i]) for i in
- range(len(data)) if feedback_texts[i] and cluster_ids[i] and texts[i] and not feedback_texts[i] == ' ']
- return [list(i) for j, i in
- itertools.groupby(sorted(blocks, key=lambda x: x.submission_id), lambda x: x.submission_id)]
-
-
-def write_feedback_inconsistencies_to_csv(inconsistencies):
- df = pd.DataFrame(list(itertools.chain.from_iterable(inconsistencies)),
- columns=['firstFeedbackId', 'secondFeedbackId', 'type'])
- df.to_csv(PATH_FEEDBACK_CONSISTENCY_OUTPUT, index=False, header=True)
diff --git a/benchmark/src/entities/__init__.py b/benchmark/src/entities/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/src/entities/cluster.py b/benchmark/src/entities/cluster.py
deleted file mode 100644
index 9c2b819e..00000000
--- a/benchmark/src/entities/cluster.py
+++ /dev/null
@@ -1,32 +0,0 @@
-
-
-class Cluster:
-
- def __init__(self, id, block_ids, probabilities, distances):
- self.id = id
- self.block_ids = block_ids
- self.probabilities = probabilities
- self.distances = distances
-
- def contains_block(self, block_id):
- return block_id in self.block_ids
-
- def probability_of_block(self, block_id):
- if not self.contains_block(block_id):
- return 0.0
- else:
- index = self.block_ids.index(block_id)
- return self.probabilities[index]
-
- @staticmethod
- def clusters_from_network_response(response):
- clusters = []
- for id, cluster_data in response.items():
- block_ids = [block["id"] for block in cluster_data["blocks"]]
- clusters.append(Cluster(id, block_ids, cluster_data["probabilities"], cluster_data["distanceMatrix"]))
- return clusters
-
- def __str__(self):
- return "Cluster {} with blocks {}".format(self.id, self.block_ids)
-
-
diff --git a/benchmark/src/entities/feedback_with_text_block.py b/benchmark/src/entities/feedback_with_text_block.py
deleted file mode 100644
index 9936b613..00000000
--- a/benchmark/src/entities/feedback_with_text_block.py
+++ /dev/null
@@ -1,23 +0,0 @@
-
-class FeedbackWithTextBlock:
-
- def __init__(self, feedback_id: int, feedback_text: str, feedback_score: float, reference: str, textblock_id: str, text: str,
- submission_id: int, cluster_id: int):
- self.id = textblock_id
- self.submission_id = submission_id
- self.cluster_id = cluster_id
- self.text = text
- self.feedback_id = feedback_id
- self.feedback_text = feedback_text
- self.feedback_score = feedback_score
- self.reference = reference
-
- def json_rep(self):
- return {
- 'textBlockId': self.id,
- 'clusterId': str(self.cluster_id),
- 'text': self.text,
- 'feedbackId': str(self.feedback_id),
- 'feedbackText': self.feedback_text,
- 'credits': self.feedback_score
- }
diff --git a/benchmark/src/entities/text_block.py b/benchmark/src/entities/text_block.py
deleted file mode 100644
index 40dd9899..00000000
--- a/benchmark/src/entities/text_block.py
+++ /dev/null
@@ -1,52 +0,0 @@
-
-
-class TextBlock:
- __last_id = 0
-
- def __init__(self, text, id=None, ground_truth_cluster=0, ground_truth_grade=None):
- self.text = text
- self.original_text = text
- self.ground_truth_cluster = ground_truth_cluster
- self.ground_truth_grade = ground_truth_grade
- self.cluster = 0
- self.embedding = None
- self.grade_from_cluster = None
- self.probability_in_cluster = None
- if id is None:
- TextBlock.__last_id = TextBlock.__last_id + 1
- self.id = TextBlock.__last_id
- else:
- self.id = id
- TextBlock.__last_id = id
-
- def __str__(self):
- self.text.__str__()
-
- def json_rep(self):
- return {
- 'id': self.id,
- 'text': self.text
- }
-
- def extract_cluster(self, clusters: list):
- self.cluster = [cluster for cluster in clusters if cluster.contains_block(self.id)][0]
- self.probability_in_cluster = self.cluster.probability_of_block(self.id)
-
- def extract_embedding(self, embeddings: list):
- self.embedding = [embedding['vector'] for embedding in embeddings if embedding['id'] == self.id][0]
-
- def similar(self, other):
- return self.cluster.id == other.cluster.id
-
- def ground_truth_similar(self, other):
- return self.ground_truth_cluster == other.ground_truth_cluster
-
- def compute_grade_from_cluster(self, text_blocks):
- if self.cluster is None:
- raise Exception("cluster for text block {} not defined".format(self.id))
- cluster_grades = [block.ground_truth_grade for block in text_blocks if self.similar(block)]
- self.grade_from_cluster = sum(cluster_grades) / float(len(cluster_grades))
-
- @staticmethod
- def from_sentences(sentences):
- return [TextBlock(sentence) for sentence in sentences]
diff --git a/benchmark/src/main.py b/benchmark/src/main.py
deleted file mode 100644
index 8335d0f4..00000000
--- a/benchmark/src/main.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import logging
-import sys
-import matplotlib.pyplot as plt
-from benchmark.src.data.data_retriever import read_labeled_sentences_from_csv, read_sentences_feedback_from_csv, \
- read_feedback_consistency_from_csv, write_feedback_inconsistencies_to_csv
-from benchmark.src.entities.cluster import Cluster
-from benchmark.src.entities.text_block import TextBlock
-from benchmark.src.networking.api_services import *
-from benchmark.src.plotting import plot_embeddings
-from benchmark.src.similarity_measure import PrecisionRecallSimilarity, GradeBasedSimilarity
-
-__logger = getLogger(__name__)
-
-
-def process_text_blocks(text_blocks, courseId=None, plot=True, log_clusters=False):
- embeddings = embed(text_blocks, courseId=courseId)
- clusters = Cluster.clusters_from_network_response(cluster(embeddings))
- for text_block in text_blocks:
- text_block.extract_cluster(clusters)
- text_block.extract_embedding(embeddings)
- if plot:
- plot_embeddings(text_blocks)
- if log_clusters:
- cluster_to_text = ["cluster {}: {}".format(textblock.cluster.id, textblock.original_text) for textblock in
- text_blocks]
- cluster_to_text.sort()
- for result in cluster_to_text:
- logger.info(result + "\n")
- return text_blocks
-
-
-def evaluate_by_labeled_sentences(courseId=None):
- text_blocks = read_labeled_sentences_from_csv()
- text_blocks = process_text_blocks(text_blocks, courseId)
- similarity_measure = PrecisionRecallSimilarity(text_blocks)
- __logger.info("similarity labeled data for course {}".format(courseId))
- similarity_measure.output_results()
-
-
-def evaluate_by_artemis_data(courseId=None):
- text_blocks = read_sentences_feedback_from_csv(num_sentences=1000)
- text_blocks = process_text_blocks(text_blocks, courseId)
- similarity_measure = GradeBasedSimilarity(text_blocks)
- __logger.info("similarity grade-based for course {}".format(courseId))
- similarity_measure.output_results()
-
-
-def plot_sentences(sentences, courseId=None):
- text_blocks = [TextBlock(sentence) for sentence in sentences]
- process_text_blocks(text_blocks, courseId, plot=True)
-
-
-def feedback_consistency_test(exercise_id):
- data = read_feedback_consistency_from_csv()
- inconsistencies = check_feedback_consistency(feedback_with_text_blocks=data, exercise_id=exercise_id)
- write_feedback_inconsistencies_to_csv(inconsistencies)
-
-
-if __name__ == "__main__":
- logger = logging.getLogger()
- logger.setLevel(logging.DEBUG)
-
- handler = logging.StreamHandler(sys.stdout)
- handler.setLevel(logging.DEBUG)
- formatter = logging.Formatter('[%(asctime)s] [%(process)d] [%(levelname)s] [%(name)s] %(message)s')
- handler.setFormatter(formatter)
- logger.addHandler(handler)
-
- sentences = [
- "class diagram depicts the structure of the system",
- "class diagram is a system model",
- "one of the system models is a class diagram",
- "the structure of the system are represented in a class diagram",
- "class diagrams contain classes and relations between them ",
- "class diagram is a UML model",
- "a diagram was presented in class",
- "we didn't deal with diagrams in class ",
- "Diagrams are part of this class",
- "This is a first class flight",
- "there are different classes of diagrams",
- "I booked first class seat on the train",
- ]
-
- feedback_consistency_test('1')
-
- evaluate_by_labeled_sentences(1478643)
- evaluate_by_labeled_sentences(81)
- evaluate_by_labeled_sentences()
-
- plt.show()
diff --git a/benchmark/src/networking/__init__.py b/benchmark/src/networking/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/src/networking/api_requests.py b/benchmark/src/networking/api_requests.py
deleted file mode 100644
index 8ae2c752..00000000
--- a/benchmark/src/networking/api_requests.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from logging import getLogger
-
-import requests
-
-__logger = getLogger(__name__)
-
-
-def post(api_endpoint, data):
- response = requests.post(url=api_endpoint, json=data)
-
- if not response:
- __logger.error("POST failed on {}: Status Code: {}".format(api_endpoint, response.status_code))
- return None
-
- return response.json() if response.status_code != 204 else None
diff --git a/benchmark/src/networking/api_services.py b/benchmark/src/networking/api_services.py
deleted file mode 100644
index 2d371a76..00000000
--- a/benchmark/src/networking/api_services.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from logging import getLogger
-from benchmark.src.networking.api_requests import post
-import numpy as np
-
-__logger = getLogger(__name__)
-
-SEGMENTATION_URL = "http://localhost:8000/segment"
-EMBEDDING_URL = "http://localhost:8001/embed"
-CLUSTERING_URL = "http://localhost:8002/cluster"
-FEEDBACK_CONSISTENCY_URL = "http://localhost:8001/feedback_consistency"
-
-
-def segment(submissions, keywords=None):
- # request with {"submissions":[{id:,text:}],"keywords":[]}
- # response with {"keywords":[],"textBlocks":[{id:,startIndex:,endIndex}]}
- request = {"submissions": submissions}
- if keywords is not None:
- request["keywords"] = keywords
- return post(SEGMENTATION_URL, request)
-
-
-def __check_feedback_consistency(feedback_with_text_block, exerciseId):
- # request with {"feedbackWithTextBlock":[{'textBlockId':,'clusterId':,'text':,'feedbackId':,'feedbackText':,'credits':}]}
- # {"feedbackInconsistencies":[{'firstFeedbackId':,'secondFeedbackId':,'type':]}
- request = {"feedbackWithTextBlock": feedback_with_text_block, "exerciseId": exerciseId}
- return post(FEEDBACK_CONSISTENCY_URL, request)
-
-
-def check_feedback_consistency(feedback_with_text_blocks, exercise_id):
- inconsistencies = []
- for fwt in feedback_with_text_blocks:
- feedback_with_text_block = [block.json_rep() for block in fwt]
- response = __check_feedback_consistency(feedback_with_text_block, exercise_id)
- if response['feedbackInconsistencies']:
- inconsistencies.append(response['feedbackInconsistencies'])
- return np.array(inconsistencies).flatten().tolist()
-
-
-def __embed(text_blocks, courseId=None):
- # request with {'courseId': 25, 'blocks': [{'id': 1, 'text': 'this is the first block'}, {'id': 2, 'text': 'this is the second block'}]}
- # response with { 'embeddings': [{'id': , 'vector':[]}] }
- request = {"blocks": [text_block.json_rep() for text_block in text_blocks]}
- if courseId is not None:
- request["courseId"] = courseId
- return post(EMBEDDING_URL, request)['embeddings']
-
-
-def embed(text_blocks, courseId=None):
- split_text_blocks = [text_blocks]
- if len(text_blocks) > 50:
- split_text_blocks = np.array_split(np.array(text_blocks), len(text_blocks) / 50)
- embeddings = list(map(lambda blocks: __embed(blocks, courseId), split_text_blocks))
- return [embedding for embedding_list in embeddings for embedding in embedding_list]
-
-
-def cluster(embeddings):
- # request with { "embeddings": [{"id": ,"vector":[]}] }
- # response with {"clusters": {"-1": {"blocks": [{"id": 1}, {"id": 2}], "probabilities": [0.0, 0.0], "distanceMatrix": [[0.0, 0.22923004776660816], [0.22923004776660816, 0.0]]}}}
- request = {"embeddings": embeddings}
- return post(CLUSTERING_URL, request)['clusters']
diff --git a/benchmark/src/plotting.py b/benchmark/src/plotting.py
deleted file mode 100644
index 9477286a..00000000
--- a/benchmark/src/plotting.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import matplotlib.pyplot as plt
-import seaborn as sns
-from sklearn.manifold import TSNE
-
-from benchmark.src.entities.text_block import TextBlock
-
-tsne = TSNE(n_components=2, random_state=0)
-
-
-def reduce_dimensions(vectors):
- return tsne.fit_transform(vectors)
-
-
-def plot_embeddings(textblocks: [TextBlock], persist_labels=False):
- textblocks = [textblock for textblock in textblocks if int(textblock.cluster.id) >= -1]
- vectors = [textblock.embedding for textblock in textblocks]
- texts = [textblock.original_text for textblock in textblocks]
- clusters = [int(textblock.cluster.id) for textblock in textblocks]
- # clusters = [int(textblock.ground_truth_cluster) for textblock in textblocks]
- probabilities = [textblock.probability_in_cluster for textblock in textblocks]
- vectors = reduce_dimensions(vectors)
-
- color_palette = sns.color_palette('deep', max(clusters) + 1)
- cluster_colors = [color_palette[x] if x >= 0
- else (0.5, 0.5, 0.5)
- for x in clusters]
- cluster_member_colors = [sns.desaturate(x, p) for x, p in
- zip(cluster_colors, probabilities)]
-
- x = vectors[:, 0]
- y = vectors[:, 1]
- labels = texts
- colors = cluster_member_colors
-
- norm = plt.Normalize(1, 4)
- fig, ax = plt.subplots()
- sc = plt.scatter(x, y, c=colors, s=100, norm=norm)
- # plt.xlim(-200, 250)
- # plt.xlim(-200, 250)
-
- if persist_labels :
- for i in range(len(x)):
- annotation = ax.annotate("", xy=(x[i], y[i]), xytext=(20, 20), textcoords="offset points",
- bbox=dict(boxstyle="round", fc="w"),
- arrowprops=dict(arrowstyle="->"))
- annotation.set_text(texts[i])
- annotation.get_bbox_patch().set_alpha(0.4)
- annotation.set_visible(True)
- else:
- annotation = ax.annotate("", xy=(0, 0), xytext=(20, 20), textcoords="offset points",
- bbox=dict(boxstyle="round", fc="w"),
- arrowprops=dict(arrowstyle="->"))
- annotation.set_visible(False)
-
- def update_annot(ind):
- pos = sc.get_offsets()[ind["ind"][0]]
- annotation.xy = pos
- text = "{}".format(" ".join([labels[n] for n in ind["ind"]]))
- annotation.set_text(text)
- annotation.get_bbox_patch().set_alpha(0.4)
-
- def hover(event):
- vis = annotation.get_visible()
- if event.inaxes == ax:
- cont, ind = sc.contains(event)
- if cont:
- update_annot(ind)
- annotation.set_visible(True)
- fig.canvas.draw_idle()
- else:
- if vis:
- annotation.set_visible(False)
- fig.canvas.draw_idle()
- fig.canvas.mpl_connect("motion_notify_event", hover)
diff --git a/benchmark/src/similarity_measure.py b/benchmark/src/similarity_measure.py
deleted file mode 100644
index a7354bf6..00000000
--- a/benchmark/src/similarity_measure.py
+++ /dev/null
@@ -1,68 +0,0 @@
-from abc import ABC, abstractmethod
-from logging import getLogger
-
-
-class SimilarityMeasure(ABC):
- @abstractmethod
- def output_results(self):
- pass
-
-
-class PrecisionRecallSimilarity(SimilarityMeasure):
- __logger = getLogger(__name__)
-
- def __init__(self, text_blocks):
- self.text_blocks = text_blocks
- self.false_negatives = 0
- self.false_positives = 0
- self.true_negatives = 0
- self.true_positives = 0
-
- for text_block in text_blocks:
- for other in text_blocks:
- if text_block.similar(other) and text_block.ground_truth_similar(other):
- self.true_positives += 1
- if not (text_block.similar(other)) and not (text_block.ground_truth_similar(other)):
- self.true_negatives += 1
- if text_block.similar(other) and not (text_block.ground_truth_similar(other)):
- self.false_positives += 1
- if not (text_block.similar(other)) and text_block.ground_truth_similar(other):
- self.false_negatives += 1
-
- self.precision = self.true_positives / (1.0 * (self.true_positives + self.false_positives))
- self.recall = self.true_positives / (1.0 * (self.true_positives + self.false_negatives))
- self.f1_score = 2 * ((self.precision * self.recall) / (self.precision + self.recall))
-
- def output_results(self):
- self.__logger.info('The achieved precision is {}'.format(self.precision))
- self.__logger.info('The achieved recall is {}'.format(self.recall))
- self.__logger.info('The achieved F1_score is {}'.format(self.f1_score))
-
-
-class GradeBasedSimilarity(SimilarityMeasure):
- __logger = getLogger(__name__)
-
- def __init__(self, text_blocks):
- for text_block in text_blocks:
- text_block.compute_grade_from_cluster(text_blocks)
- self.text_blocks = text_blocks
- self.l1_loss = sum(
- [abs((text_block.grade_from_cluster - text_block.ground_truth_grade)) for text_block in text_blocks]) / \
- len(text_blocks)
-
- def output_results(self):
- self.__logger.info('The L1 loss for the model is {}'.format(self.l1_loss))
- max_over_graded = max(self.text_blocks,
- key=lambda text_block: text_block.grade_from_cluster - text_block.ground_truth_grade)
- self.__logger.info(
- "The most over-graded sentence is \"{}\". \n Assigned:{} but ground truth: {}".format(
- max_over_graded.original_text,
- max_over_graded.grade_from_cluster,
- max_over_graded.ground_truth_grade))
- max_under_graded = max(self.text_blocks,
- key=lambda text_block: text_block.ground_truth_grade - text_block.grade_from_cluster)
- self.__logger.info(
- "The most under-graded sentence is \"{}\". \n Assigned:{} but ground truth: {}".format(
- max_under_graded.original_text,
- max_under_graded.grade_from_cluster,
- max_under_graded.ground_truth_grade))
diff --git a/benchmark/src/test/__init__.py b/benchmark/src/test/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/src/test/test_clustering.py b/benchmark/src/test/test_clustering.py
deleted file mode 100644
index 7acd0cf8..00000000
--- a/benchmark/src/test/test_clustering.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from unittest import TestCase
-
-from benchmark.src.entities.text_block import TextBlock
-from benchmark.src.networking.api_services import cluster, embed
-
-sentences_flowers = ["A second flower blossomed and remained.",
- "I have red and yellow flowers",
- "flowers and roses are beautiful",
- "She picked the flower up and smelled it"]
-sentences_software = ["this is the clustering component of the text assessment software engineering project",
- "In software engineering, a software design pattern is a general, reusable solution to a commonly occurring problem within a given context in software design.",
- "Patterns in software engineering is a lecture at TUM",
- "Software engineering is defined as a process of analyzing user requirements and then designing, building, and testing software"]
-sentences_law = ["the congress decided against this law",
- "I want to study law and become lawyer",
- "you can't brake the law like this",
- "Law breaking is usually punished with jail"]
-
-embeddings_flowers = embed(TextBlock.from_sentences(sentences_flowers))
-embeddings_software = embed(TextBlock.from_sentences(sentences_software))
-embeddings_law = embed(TextBlock.from_sentences(sentences_law))
-
-
-class TestClustering(TestCase):
-
- def test_cluster_same_sentences(self):
- embeddings_software_repeated = embed(TextBlock.from_sentences([sentences_software[0]] * 5))
- clusters = cluster(embeddings_software_repeated)
- print(clusters)
- self.assertEqual(1, len(clusters))
-
- def test_cluster_similar_sentences(self):
- clusters = cluster(embeddings_flowers)
- self.assertEqual(1, len(clusters))
-
- clusters = cluster(embeddings_software)
- self.assertEqual(1, len(clusters))
-
- clusters = cluster(embeddings_law)
- self.assertEqual(1, len(clusters))
-
- def test_cluster_different_topics(self):
- clusters = cluster(embeddings_flowers+embeddings_software+embeddings_law)
- # test: there are 3 different clusters
- self.assertEqual(3, len(clusters))
diff --git a/clustering/requirements.txt b/clustering/requirements.txt
index 90016305..6c0dd343 100644
--- a/clustering/requirements.txt
+++ b/clustering/requirements.txt
@@ -5,7 +5,6 @@ matplotlib==3.7.0
numpy==1.24.2
pandas==1.5.3
pydantic==1.10.4
-pymongo==4.3.3
requests==2.31.0
scikit-learn==1.2.1
scipy==1.10.0
diff --git a/clustering/src/database/Connection.py b/clustering/src/database/Connection.py
deleted file mode 100644
index 91d1b3d8..00000000
--- a/clustering/src/database/Connection.py
+++ /dev/null
@@ -1,112 +0,0 @@
-import os
-import pymongo
-
-
-# this class contains most of the important collection level pymongo operations but not all of them
-# for the whole list and detailed explanations - https://api.mongodb.com/python/current/api/pymongo/collection.html
-class Connection:
-
- def __init__(self):
- # Get container variables for datbase connection
- dbhost = str(os.environ['DATABASE_HOST']) if "DATABASE_HOST" in os.environ else "database"
- dbport = int(os.environ['DATABASE_PORT']) if "DATABASE_PORT" in os.environ else 27017
- dbname = str(os.environ['DATABASE_NAME']) if "DATABASE_NAME" in os.environ else "athene_db"
- dbuser = str(os.environ['CLUSTERING_DATABASE_USER']) if "CLUSTERING_DATABASE_USER" in os.environ else "clustering"
- dbpwd = str(os.environ['CLUSTERING_DATABASE_PWD']) if "CLUSTERING_DATABASE_PWD" in os.environ else "clustering_password"
- self.client = pymongo.MongoClient(host=dbhost, port=dbport, username=dbuser, password=dbpwd,
- authSource=dbname)
- self.db = self.client[dbname]
- self.collection = None
-
- # inserts one document to a collection
- # collection {string} - collection name to store the document
- # document {field-value pairs} - e.g. {'x': 1, 'y': "apples"}
- def insert_document(self, collection, document):
- try:
- self.collection = self.db[collection]
- self.collection.insert_one(document)
- except Exception as e:
- print(e)
-
- # inserts an array of documents to a collection
- # collection {string} - collection name to store the document
- # document {array} - e.g. [{'x': 1, 'y': "apples"}, {'x': 15, 'y': "oranges", 'z': 40.5}]
- def insert_documents(self, collection, documents: []):
- try:
- self.collection = self.db[collection]
- self.collection.insert_many(documents)
- except Exception as e:
- print(e)
-
- # query database and returns results
- # filter_dict {field-value pairs} - specifies elements which must be present in the resulting set
- # projection {field-value pairs} - list of field names should be included or excluded in the resulting set. e.g. {‘_id’: False} _id values will be excluded in the resulting set
- # skip {int} - number of documents to omit (from the start of the result set) when returning the results
- # limit {int} - max number of results to return
- # max_time_ms {int} - Specifies a time limit for a query operation. If the specified time is exceeded, the operation will be aborted
- def find_documents(self, collection, filter_dict, projection=None, skip=0, limit=0, max_time_ms=None):
- try:
- self.collection = self.db[collection]
- docs = self.collection.find(filter=filter_dict, projection=projection, skip=skip, limit=limit,
- max_time_ms=max_time_ms)
- except Exception as e:
- print(e)
- else:
- return docs
-
- # update a document matching the filter
- # filter_dict {field-value pairs} - find the document to update e.g. {'x': 1}
- # update_dict {field-value pairs} - modifications to apply e.g. {'$set': {'x': 3}}
- # upsert {boolean} - if true performs insert when no documents match the filter
- # Note: For the full list of update parameters https://docs.mongodb.com/manual/reference/operator/update/
- def update_document(self, collection, filter_dict, update_dict, upsert=False):
- try:
- self.collection = self.db[collection]
- result = self.collection.update_one(filter_dict, update_dict, upsert)
- except Exception as e:
- print(e)
- else:
- return result
-
- # updates one or more documents matching the filter
- def update_documents(self, collection, filter_dict, update_dict, upsert=False):
- try:
- self.collection = self.db[collection]
- result = self.collection.update_many(filter_dict, update_dict, upsert)
- except Exception as e:
- print(e)
- else:
- return result
-
- # deletes one document matching the filter
- def delete_document(self, collection, filter_dict):
- try:
- self.collection = self.db[collection]
- result = self.collection.delete_one(filter_dict)
- except Exception as e:
- print(e)
- else:
- return result
-
- # deletes one or more documents matching the filter
- def delete_documents(self, collection, filter_dict):
- try:
- self.collection = self.db[collection]
- result = self.collection.delete_many(filter_dict)
- except Exception as e:
- print(e)
- else:
- return result
-
- # counts the number of documents in collection matching the filter
- def count_documents(self, collection, filter_dict):
- try:
- self.collection = self.db[collection]
- result = self.collection.count_documents(filter_dict)
- except Exception as e:
- print(e)
- else:
- return result
-
- def get_collection_names(self):
- return self.db.collection_names()
diff --git a/clustering/src/database/__init__.py b/clustering/src/database/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/docker-compose-traefik-db.yml b/docker-compose-traefik-db.yml
deleted file mode 100644
index c41bce8f..00000000
--- a/docker-compose-traefik-db.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-version: '3'
-
-# Use this docker file to only start Traefik and the database.
-# You will have to start the microservices manually, see the individual READMEs for details.
-
-services:
- traefik:
- image: traefik:v2.9.6
- container_name: athene-traefik-local
- restart: unless-stopped
- ports:
- - ${TRAEFIK_DASHBOARD_PORT}:8080
- - ${TRAEFIK_HTTP_PORT}:80
- volumes:
- - ./traefik/traefik.local.yml:/etc/traefik/traefik.yml:ro
- - ./traefik/traefik-dynamic.local.yml:/etc/traefik/traefik-dynamic.local.yml:ro
-
- database:
- image: mongo:latest
- container_name: athene-mongodb
- restart: unless-stopped
- expose:
- - 27017
- environment:
- - MONGO_INITDB_DATABASE=${DATABASE_NAME}
- - MONGO_INITDB_ROOT_USERNAME=${DATABASE_ROOT_USERNAME}
- - MONGO_INITDB_ROOT_PASSWORD=${DATABASE_ROOT_PASSWORD}
- volumes:
- - ./init-mongo.js:/docker-entrypoint-initdb.d/init-mongo.js:ro
- - ./data/db:/data/db
diff --git a/docker-compose-traefik.yml b/docker-compose-traefik.yml
new file mode 100644
index 00000000..a7ba494a
--- /dev/null
+++ b/docker-compose-traefik.yml
@@ -0,0 +1,16 @@
+version: '3'
+
+# Use this docker file to only start Traefik.
+# You will have to start the microservices manually, see the individual READMEs for details.
+
+services:
+ traefik:
+ image: traefik:v2.9.6
+ container_name: athene-traefik-local
+ restart: unless-stopped
+ ports:
+ - ${TRAEFIK_DASHBOARD_PORT}:8080
+ - ${TRAEFIK_HTTP_PORT}:80
+ volumes:
+ - ./traefik/traefik.local.yml:/etc/traefik/traefik.yml:ro
+ - ./traefik/traefik-dynamic.local.yml:/etc/traefik/traefik-dynamic.local.yml:ro
diff --git a/docker-compose.override.yml b/docker-compose.override.yml
index 5ecfff74..5df97877 100644
--- a/docker-compose.override.yml
+++ b/docker-compose.override.yml
@@ -45,10 +45,3 @@ services:
clustering:
volumes:
- ./clustering/src:/usr/src/app/src
-
- tracking:
- volumes:
- - ./tracking/src:/usr/src/app/src
-
- # database:
- # This component does not need to be changed
diff --git a/docker-compose.yml b/docker-compose.yml
index 9f5bb1f7..b25da03d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -11,7 +11,6 @@ services:
# These components have API endpoints managed by traefik
- load-balancer
- embedding
- - tracking
ports:
- ${TRAEFIK_DASHBOARD_PORT}:8080
- ${TRAEFIK_HTTP_PORT}:80
@@ -82,15 +81,9 @@ services:
restart: unless-stopped
depends_on:
- load-balancer
- - database
expose:
- 8000
environment:
- - DATABASE_HOST
- - DATABASE_PORT
- - DATABASE_NAME
- - EMBEDDING_DATABASE_USER
- - EMBEDDING_DATABASE_PWD
- AUTHORIZATION_SECRET
- BALANCER_QUEUE_FREQUENCY
- BALANCER_GETTASK_URL
@@ -118,15 +111,9 @@ services:
restart: unless-stopped
depends_on:
- load-balancer
- - database
expose:
- 8000
environment:
- - DATABASE_HOST
- - DATABASE_PORT
- - DATABASE_NAME
- - CLUSTERING_DATABASE_USER
- - CLUSTERING_DATABASE_PWD
- AUTHORIZATION_SECRET
- BALANCER_QUEUE_FREQUENCY
- BALANCER_GETTASK_URL
@@ -137,52 +124,6 @@ services:
labels:
- traefik.enable=true
- # http://localhost/tracking
- tracking:
- build:
- context: .
- dockerfile: ./tracking/Dockerfile
- image: athene-tracking
- container_name: athene-tracking
- restart: unless-stopped
- depends_on:
- - database
- expose:
- - 8000
- environment:
- - AUTHORIZATION_SECRET
- - DATABASE_HOST
- - DATABASE_PORT
- - DATABASE_NAME
- - TRACKING_DATABASE_USER
- - TRACKING_DATABASE_PWD
- working_dir: /usr/src/app
- networks:
- - athene
- labels:
- - traefik.enable=true
- - traefik.http.routers.tracking.rule=PathPrefix(`/tracking`)
- - traefik.http.routers.tracking.entrypoints=web
- - traefik.http.routers.tracking-tls.rule=PathPrefix(`/tracking`)
- - traefik.http.routers.tracking-tls.entrypoints=websecure
- - traefik.http.routers.tracking-tls.tls=true
-
- database:
- image: mongo:latest
- container_name: athene-mongodb
- restart: unless-stopped
- expose:
- - 27017
- environment:
- - MONGO_INITDB_DATABASE=${DATABASE_NAME}
- - MONGO_INITDB_ROOT_USERNAME=${DATABASE_ROOT_USERNAME}
- - MONGO_INITDB_ROOT_PASSWORD=${DATABASE_ROOT_PASSWORD}
- volumes:
- - ./init-mongo.js:/docker-entrypoint-initdb.d/init-mongo.js:ro
- - ./data/db:/data/db
- networks:
- - athene
-
networks:
athene:
driver: bridge
diff --git a/embedding/requirements.txt b/embedding/requirements.txt
index 1789ff6b..adcd696e 100644
--- a/embedding/requirements.txt
+++ b/embedding/requirements.txt
@@ -7,7 +7,6 @@ fastapi==0.95.2
joblib==1.2.0
nltk==3.8.1
numpy==1.20 # needs to be 1.20 because after that, np.int does not exist any more, which breaks nltk
-pymongo==4.3.3
requests==2.31.0
scikit-learn==0.22 # needs to be 0.22 because after that, sklearn.utils.linear_assignment_ is removed, which breaks allennlp
scipy==1.10.0
diff --git a/embedding/src/database/Connection.py b/embedding/src/database/Connection.py
deleted file mode 100644
index 183b0fff..00000000
--- a/embedding/src/database/Connection.py
+++ /dev/null
@@ -1,121 +0,0 @@
-import os
-import pymongo
-
-
-# this class contains most of the important collection level pymongo operations but not all of them
-# for the whole list and detailed explanations - https://api.mongodb.com/python/current/api/pymongo/collection.html
-class Connection:
-
- def __init__(self):
- # Get container variables for datbase connection
- dbhost = str(os.environ['DATABASE_HOST']) if "DATABASE_HOST" in os.environ else "database"
- dbport = int(os.environ['DATABASE_PORT']) if "DATABASE_PORT" in os.environ else 27017
- dbname = str(os.environ['DATABASE_NAME']) if "DATABASE_NAME" in os.environ else "athene_db"
- dbuser = str(os.environ['EMBEDDING_DATABASE_USER']) if "EMBEDDING_DATABASE_USER" in os.environ else "embedding"
- dbpwd = str(os.environ['EMBEDDING_DATABASE_PWD']) if "EMBEDDING_DATABASE_PWD" in os.environ else "embedding_password"
- self.client = pymongo.MongoClient(host=dbhost, port=dbport, username=dbuser, password=dbpwd,
- authSource=dbname)
- self.db = self.client[dbname]
- self.collection = None
-
- # inserts one document to a collection
- # collection {string} - collection name to store the document
- # document {field-value pairs} - e.g. {'x': 1, 'y': "apples"}
- def insert_document(self, collection, document):
- try:
- self.collection = self.db[collection]
- self.collection.insert_one(document)
- except Exception as e:
- raise e
-
- # inserts an array of documents to a collection
- # collection {string} - collection name to store the document
- # document {array} - e.g. [{'x': 1, 'y': "apples"}, {'x': 15, 'y': "oranges", 'z': 40.5}]
- def insert_documents(self, collection, documents: []):
- try:
- self.collection = self.db[collection]
- self.collection.insert_many(documents)
- except Exception as e:
- raise e
-
- # query database and returns results
- # filter_dict {field-value pairs} - specifies elements which must be present in the resulting set
- # projection {field-value pairs} - list of field names should be included or excluded in the resulting set. e.g. {‘_id’: False} _id values will be excluded in the resulting set
- # skip {int} - number of documents to omit (from the start of the result set) when returning the results
- # limit {int} - max number of results to return
- # max_time_ms {int} - Specifies a time limit for a query operation. If the specified time is exceeded, the operation will be aborted
- def find_documents(self, collection, filter_dict, projection=None, skip=0, limit=0, max_time_ms=None):
- try:
- self.collection = self.db[collection]
- docs = self.collection.find(filter=filter_dict, projection=projection, skip=skip, limit=limit,
- max_time_ms=max_time_ms)
- except Exception as e:
- raise e
- else:
- return docs
-
- # update a document matching the filter
- # filter_dict {field-value pairs} - find the document to update e.g. {'x': 1}
- # update_dict {field-value pairs} - modifications to apply e.g. {'$set': {'x': 3}}
- # upsert {boolean} - if true performs insert when no documents match the filter
- # Note: For the full list of update parameters https://docs.mongodb.com/manual/reference/operator/update/
- def update_document(self, collection, filter_dict, update_dict, upsert=False):
- try:
- self.collection = self.db[collection]
- result = self.collection.update_one(filter_dict, update_dict, upsert)
- except Exception as e:
- raise e
- else:
- return result
-
- # updates one or more documents matching the filter
- def update_documents(self, collection, filter_dict, update_dict, upsert=False):
- try:
- self.collection = self.db[collection]
- result = self.collection.update_many(filter_dict, update_dict, upsert)
- except Exception as e:
- raise e
- else:
- return result
-
- def replace_document(self, collection, filter_dict, replacement_dict, upsert=False):
- try:
- self.collection = self.db[collection]
- result = self.collection.replace_one(filter_dict, replacement_dict, upsert)
- except Exception as e:
- raise e
- else:
- return result
-
- # deletes one document matching the filter
- def delete_document(self, collection, filter_dict):
- try:
- self.collection = self.db[collection]
- result = self.collection.delete_one(filter_dict)
- except Exception as e:
- raise e
- else:
- return result
-
- # deletes one or more documents matching the filter
- def delete_documents(self, collection, filter_dict):
- try:
- self.collection = self.db[collection]
- result = self.collection.delete_many(filter_dict)
- except Exception as e:
- raise e
- else:
- return result
-
- # counts the number of documents in collection matching the filter
- def count_documents(self, collection, filter_dict):
- try:
- self.collection = self.db[collection]
- result = self.collection.count_documents(filter_dict)
- except Exception as e:
- raise e
- else:
- return result
-
- def get_collection_names(self):
- return self.db.collection_names()
diff --git a/embedding/src/database/__init__.py b/embedding/src/database/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/embedding/src/feedback/FeedbackCommentRequest.py b/embedding/src/feedback/FeedbackCommentRequest.py
deleted file mode 100644
index eb42de5c..00000000
--- a/embedding/src/feedback/FeedbackCommentRequest.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import json
-from logging import getLogger
-from fastapi import APIRouter, Request
-from src.errors import invalidJson, requireFeedbackWithTextBlock, requireExerciseId
-from src.entities import FeedbackWithTextBlock, Feedback
-from src.feedback.FeedbackConsistency import FeedbackConsistency
-
-logger = getLogger(name="FeedbackCommentRequest")
-router = APIRouter()
-
-@router.post("/feedback_consistency")
-async def feedback(request: Request):
- logger.debug("-" * 80)
- logger.info("Start processing Feedback Comment Request:")
-
- # Parse json
- try:
- doc = await request.json()
- except Exception as e:
- logger.error("Exception while parsing json: {}".format(str(e)))
- raise invalidJson
-
- logger.info("Request: {}".format(doc))
- if "feedbackWithTextBlock" not in doc:
- logger.error("{}".format(requireFeedbackWithTextBlock.detail))
- raise requireFeedbackWithTextBlock
-
- if "exerciseId" not in doc:
- logger.error("{}".format(requireExerciseId.detail))
- raise requireExerciseId
-
- blocks: list[FeedbackWithTextBlock] = []
-
- for fwt in doc['feedbackWithTextBlock']:
- blocks.append(FeedbackWithTextBlock(fwt['textBlockId'], fwt['clusterId'], fwt['text'], Feedback(fwt['feedbackId'], fwt['feedbackText'], fwt['credits'])))
-
- __fc = FeedbackConsistency(doc['exerciseId'])
- response = __fc.check_consistency(feedback_with_text_blocks=blocks)
- logger.info("Response {}".format(response))
- __fc.store_feedback()
-
- logger.info("Completed Feedback Comment Embedding Request.")
- logger.debug("-" * 80)
- return response
diff --git a/embedding/src/feedback/FeedbackCommentResource.py b/embedding/src/feedback/FeedbackCommentResource.py
deleted file mode 100644
index 05d96ee6..00000000
--- a/embedding/src/feedback/FeedbackCommentResource.py
+++ /dev/null
@@ -1,124 +0,0 @@
-import numpy as np
-import os
-import json
-import pickle
-import requests
-from logging import getLogger
-from typing import List
-from src.elmo import ELMo
-from src.database.Connection import Connection
-from src.entities import FeedbackWithTextBlock, Feedback, Sentence, ElmoVector
-
-# Get container variable for segmentation url
-SEGMENTATION_URL = str(os.environ['SEGMENTATION_URL']) if "SEGMENTATION_URL" in os.environ else "http://segmentation:8000/segment"
-
-
-class FeedbackCommentResource:
- __logger = getLogger(__name__)
- __collection = 'feedback_consistency'
-
- def __init__(self, exercise_id):
- self.__elmo = ELMo()
- self.__conn = Connection()
- self.__collection = 'feedback_consistency_' + (str(exercise_id) if exercise_id != -1 else 'test')
-
- def __segment_feedback_comments(self, feedback_with_tb: list):
- self.__logger.info("Segment Feedback Comments.")
- feedback = []
- for f in feedback_with_tb:
- feedback.append({"id": f.feedback.id, "text": f.feedback.text})
-
- request = {"feedback": feedback}
- return self.post(SEGMENTATION_URL, request)
-
- def __embed_sentences(self, sentence: List[Sentence]):
- return self.__elmo.embed_sentences(sentence)
-
- def __create_feedback_document(self, feedback_with_tb: FeedbackWithTextBlock):
- embeddings = []
- for embedding in feedback_with_tb.feedback.feedbackEmbeddings:
- embeddings.append({'embedding': pickle.dumps(np.array(embedding).flatten().tolist())})
-
- doc = {'_id': feedback_with_tb.id,
- 'cluster_id': feedback_with_tb.cluster_id,
- 'text': feedback_with_tb.text,
- 'text_embedding': pickle.dumps(np.array(feedback_with_tb.text_embedding).flatten().tolist()),
- 'feedback': {'feedback_id': feedback_with_tb.feedback.id,
- 'feedback_text': feedback_with_tb.feedback.text,
- 'feedback_score': feedback_with_tb.feedback.score,
- 'feedback_text_blocks': embeddings}}
-
- return doc
-
- def __replace_insert_documents(self, documents: []):
- self.__logger.info("Replace-Insert Feedback.")
- for doc in documents:
- __filter = {'_id': doc['_id']}
- try:
- result = self.__conn.replace_document(collection=self.__collection, filter_dict=__filter,
- replacement_dict=doc, upsert=True)
- except Exception as e:
- self.__logger.error(e)
- else:
- self.__logger.info(
- "Modified Count: {} Upserted id {}".format(result.modified_count, result.upserted_id))
-
- def embed_feedback(self, feedback_with_tb: list):
- self.__logger.info("Embed Feedback.")
- segmented_feedback_comments = self.__segment_feedback_comments(feedback_with_tb)
-
- for fwt in feedback_with_tb:
- blocks = (blocks for blocks in segmented_feedback_comments['textBlocks'] if fwt.feedback.id == blocks['id'])
- sentences: List[Sentence] = list(map(lambda b: fwt.feedback.text[b['startIndex']:b['endIndex']], blocks))
- vectors: List[ElmoVector] = self.__embed_sentences(sentences)
- for v in vectors:
- fwt.add_feedback_embedding(v)
-
- return feedback_with_tb
-
- def embed_feedback_text_blocks(self, feedback_with_tb: list):
- sentences: List[Sentence] = list(map(lambda b: b.text, feedback_with_tb))
- vectors: List[ElmoVector] = self.__embed_sentences(sentences)
- for fwt, vector in zip(feedback_with_tb, vectors):
- fwt.text_embedding = vector
- return feedback_with_tb
-
- def store_feedback(self, feedback_with_tb: list):
- self.__logger.info("Store Feedback.")
-
- docs = []
- for fwt in feedback_with_tb:
- docs.append(self.__create_feedback_document(feedback_with_tb=fwt))
-
- self.__replace_insert_documents(documents=docs)
-
- def get_feedback_in_same_cluster(self, cluster_id: str, feedback_id: str):
- self.__logger.info("Get feedback with same cluster id.")
- _filter = {'$and': [{'cluster_id': cluster_id}, {'feedback.feedback_id': {'$ne': feedback_id}}]}
- try:
- result = self.__conn.find_documents(collection=self.__collection, filter_dict=_filter)
- except Exception as e:
- self.__logger.error(e)
- return None
- else:
- return result
-
- def set_feedback_consistency_results(self, collection, doc):
- try:
- result = self.__conn.insert_document(collection=collection, document=doc)
- except Exception as e:
- self.__logger.error(e)
- return None
- else:
- return result
-
- def post(self, api_endpoint, data):
- response = requests.post(url=api_endpoint, json=data)
-
- if not response:
- self.__logger.error("POST failed on {}: Status Code: {} Response: {}".format(api_endpoint,
- response.status_code,
- response.content))
- return None
-
- return json.loads(response.json())
diff --git a/embedding/src/feedback/FeedbackConsistency.py b/embedding/src/feedback/FeedbackConsistency.py
deleted file mode 100644
index d5aa37f1..00000000
--- a/embedding/src/feedback/FeedbackConsistency.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import pickle
-import numpy as np
-from logging import getLogger
-from sklearn.metrics import pairwise_distances, silhouette_score
-from src.feedback.FeedbackCommentResource import FeedbackCommentResource
-
-
-class FeedbackConsistency:
- __logger = getLogger(__name__)
- __feedback_with_text_blocks: list = None
-
- def __init__(self, exercise_id):
- self.__feedback_comment_resource = FeedbackCommentResource(exercise_id)
- self.__comment_threshold = 0.37
- self.__text_block_threshold = 0.21
-
- def __get_inconsistency(self, score_diff: float, comment_distance: float, text_block_distance: float):
- if text_block_distance < self.__text_block_threshold:
- if score_diff:
- return 'INCONSISTENT_SCORE' if comment_distance < self.__comment_threshold else 'INCONSISTENT_FEEDBACK'
- else:
- return 'INCONSISTENT_COMMENT' if comment_distance > self.__comment_threshold else None
- else:
- return None
-
- def __calculate_distance_with_silhouette_score(self, x: [], y: []):
- if len(x) < 2 and len(y) < 2:
- distance = pairwise_distances(X=x, Y=y, metric='cosine').flatten()[0]
- else:
- samples = np.concatenate((x, y))
- labels = np.concatenate([np.full((1, len(x)), 1).flatten(), np.full((1, len(y)), 2).flatten()])
- distance = silhouette_score(X=samples, labels=labels, metric='cosine')
- return distance
-
- def __calculate_mean_distance(self, x: [], y: []):
- distance = pairwise_distances(X=x, Y=y, metric='cosine')
- return np.mean(np.mean(distance, axis=1))
-
- def check_consistency(self, feedback_with_text_blocks):
- self.__logger.info("Check Consistencies")
- # Find embeddings for each feedback comment
- self.__feedback_with_text_blocks = self.__feedback_comment_resource.embed_feedback(
- feedback_with_tb=feedback_with_text_blocks)
- # Find embeddings for each student text
- self.__feedback_with_text_blocks = self.__feedback_comment_resource.embed_feedback_text_blocks(
- feedback_with_tb=feedback_with_text_blocks)
- doc = []
- # Compare each new assessment with the ones in the database
- for fwt in self.__feedback_with_text_blocks:
- feedback_vector_x = fwt.feedback.feedbackEmbeddings
- student_text_vector_x = fwt.text_embedding.reshape(1, -1).tolist()
- # Get the assessments which have same the same cluster id
- cluster = self.__feedback_comment_resource.get_feedback_in_same_cluster(cluster_id=fwt.cluster_id,
- feedback_id=fwt.feedback.id)
- # Calculate distances between each feedback embeddings and text block embeddings(student answers)
- for item in cluster:
- feedback_vector_y = list(map(lambda embedding: pickle.loads(embedding['embedding']),
- item['feedback']['feedback_text_blocks']))
- student_text_vector_y = np.array(pickle.loads(item['text_embedding'])).reshape(1, -1).tolist()
- feedback_distance = self.__calculate_mean_distance(x=feedback_vector_x, y=feedback_vector_y)
- text_block_distance = self.__calculate_mean_distance(x=student_text_vector_x, y=student_text_vector_y)
- inconsistency = self.__get_inconsistency(
- score_diff=abs(fwt.feedback.score - item['feedback']['feedback_score']),
- comment_distance=feedback_distance, text_block_distance=text_block_distance)
- if inconsistency:
- doc.append({"firstFeedbackId": fwt.feedback.id, "secondFeedbackId": item['feedback']['feedback_id'], "type": inconsistency})
-
- return {'feedbackInconsistencies': doc}
-
- def store_feedback(self):
- self.__feedback_comment_resource.store_feedback(self.__feedback_with_text_blocks)
diff --git a/embedding/src/feedback/__init__.py b/embedding/src/feedback/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/embedding/src/main.py b/embedding/src/main.py
index def5b6fd..74c5dec7 100644
--- a/embedding/src/main.py
+++ b/embedding/src/main.py
@@ -3,7 +3,6 @@
from fastapi import FastAPI, Request, Response, BackgroundTasks
from src.TimerHandler import TimerHandler
from src import UploadingResource
-from src.feedback import FeedbackCommentRequest
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
@@ -19,7 +18,6 @@
app = FastAPI()
app.include_router(UploadingResource.router)
-app.include_router(FeedbackCommentRequest.router)
@app.post("/trigger")
diff --git a/init-mongo.js b/init-mongo.js
deleted file mode 100644
index 2a22120f..00000000
--- a/init-mongo.js
+++ /dev/null
@@ -1,37 +0,0 @@
-db.createUser(
- {
- user: "embedding",
- pwd: "embedding_password",
- roles: [
- {
- role: "readWrite",
- db: "athene_db"
- }
- ]
- }
-);
-db.createUser(
- {
- user: "clustering",
- pwd: "clustering_password",
- roles: [
- {
- role: "readWrite",
- db: "athene_db"
- }
- ]
- }
-);
-
-db.createUser(
- {
- user: "tracking",
- pwd: "tracking_password",
- roles: [
- {
- role: "readWrite",
- db: "athene_db"
- }
- ]
- }
-);
diff --git a/tracking/Dockerfile b/tracking/Dockerfile
deleted file mode 100644
index 0cf2261f..00000000
--- a/tracking/Dockerfile
+++ /dev/null
@@ -1,9 +0,0 @@
-FROM tiangolo/uvicorn-gunicorn-fastapi:python3.10
-
-COPY ./tracking/requirements.txt requirements.txt
-RUN pip install -r requirements.txt
-
-COPY ./tracking/src/ src/
-
-EXPOSE 8000
-CMD uvicorn --host 0.0.0.0 --port 8000 src.main:app
diff --git a/tracking/Makefile b/tracking/Makefile
deleted file mode 100644
index b23f7b7b..00000000
--- a/tracking/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-#!make
-
-all: .venv
-
-.venv: requirements.txt
- python -m venv .venv
- source .venv/bin/activate; pip install -r requirements.txt
-
-start:
- source .venv/bin/activate; python start.py
-
-clean:
- rm -rf .venv
-
-.PHONY: all start clean
\ No newline at end of file
diff --git a/tracking/README.md b/tracking/README.md
deleted file mode 100644
index e19e3d72..00000000
--- a/tracking/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# Tracking Service
-
-## Start locally (without Docker)
-
-Locally, the service runs on port 8003. To start it,
-
-* first, run the following command for some preparations:
- ```bash
- make
- ```
- This will create a virtual environment and install all dependencies.
-
-* After that, configure the used virtual environment:
- ```bash
- source venv/bin/activate
- ```
- If you use an IDE, you can also configure the virtual environment there.
- In PyCharm, you can even go to `File > Open`, choose the embedding folder
- and then choose the `Attach` option.
-
-* Then, you can start the tracking server using `python start.py` or using your IDE.
-
-## Start with Docker
-
-Use the `docker-compose.yml` file from the parent directory
-to start the embedding service (and all others) with Docker.
diff --git a/tracking/requirements.txt b/tracking/requirements.txt
deleted file mode 100644
index e551adc1..00000000
--- a/tracking/requirements.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-fastapi==0.90.0
-jaro-winkler==2.0.3
-numpy==1.24.2
-pandas==1.5.3
-PyJWT==2.6.0
-pymongo==4.3.3
-scikit-learn==1.2.1
-scipy==1.10.0
-uvicorn==0.20.0
diff --git a/tracking/src/__init__.py b/tracking/src/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tracking/src/database/Connection.py b/tracking/src/database/Connection.py
deleted file mode 100644
index d0bf0dae..00000000
--- a/tracking/src/database/Connection.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import os
-import pymongo
-import pandas as pd
-
-
-# this class contains most of the important collection level pymongo operations but not all of them
-# for the whole list and detailed explanations - https://api.mongodb.com/python/current/api/pymongo/collection.html
-class Connection:
-
- def __init__(self):
- # Get container variables for datbase connection
- dbhost = str(os.environ['DATABASE_HOST']) if "DATABASE_HOST" in os.environ else "database"
- dbport = int(os.environ['DATABASE_PORT']) if "DATABASE_PORT" in os.environ else 27017
- dbname = str(os.environ['DATABASE_NAME']) if "DATABASE_NAME" in os.environ else "athene_db"
- dbuser = str(os.environ['TRACKING_DATABASE_USER']) if "TRACKING_DATABASE_USER" in os.environ else "tracking"
- dbpwd = str(os.environ['TRACKING_DATABASE_PWD']) if "TRACKING_DATABASE_PWD" in os.environ else "tracking_password"
- self.client = pymongo.MongoClient(host=dbhost, port=dbport, username=dbuser, password=dbpwd,
- authSource=dbname)
- self.db = self.client[dbname]
- self.collection = None
-
- # inserts one document to a collection
- # collection {string} - collection name to store the document
- # document {field-value pairs} - e.g. {'x': 1, 'y': "apples"}
- def insert_document(self, collection: str, document: dict):
- self.collection = self.db[collection]
- self.collection.insert_one(document)
-
- # inserts an array of documents to a collection
- # collection {string} - collection name to store the document
- # document {array} - e.g. [{'x': 1, 'y': "apples"}, {'x': 15, 'y': "oranges", 'z': 40.5}]
- def insert_documents(self, collection: str, documents: [dict]):
- try:
- self.collection = self.db[collection]
- self.collection.insert_many(documents)
- except Exception as e:
- print(e)
-
- # query database and returns results
- # filter_dict {field-value pairs} - specifies elements which must be present in the resulting set
- # projection {field-value pairs} - list of field names should be included or excluded in the resulting set. e.g. {‘_id’: False} _id values will be excluded in the resulting set
- # skip {int} - number of documents to omit (from the start of the result set) when returning the results
- # limit {int} - max number of results to return
- # max_time_ms {int} - Specifies a time limit for a query operation. If the specified time is exceeded, the operation will be aborted
- def find_documents(self, collection: str, filter_dict: dict, projection: dict = None, skip: int = 0, limit: int = 0,
- max_time_ms: int = None):
- try:
- self.collection = self.db[collection]
- docs = self.collection.find(filter=filter_dict, projection=projection, skip=skip, limit=limit,
- max_time_ms=max_time_ms)
- except Exception as e:
- print(e)
- else:
- return docs
-
- # update a document matching the filter
- # filter_dict {field-value pairs} - find the document to update e.g. {'x': 1}
- # update_dict {field-value pairs} - modifications to apply e.g. {'$set': {'x': 3}}
- # upsert {boolean} - if true performs insert when no documents match the filter
- # Note: For the full list of update parameters https://docs.mongodb.com/manual/reference/operator/update/
- def update_document(self, collection: str, filter_dict: dict, update_dict: dict, upsert: bool = False):
- try:
- self.collection = self.db[collection]
- result = self.collection.update_one(filter_dict, update_dict, upsert)
- except Exception as e:
- print(e)
- else:
- return result
-
- # updates one or more documents matching the filter
- def update_documents(self, collection: str, filter_dict: dict, update_dict: dict, upsert: bool = False):
- try:
- self.collection = self.db[collection]
- result = self.collection.update_many(filter_dict, update_dict, upsert)
- except Exception as e:
- print(e)
- else:
- return result
-
- # deletes one document matching the filter
- def delete_document(self, collection: str, filter_dict: dict):
- try:
- self.collection = self.db[collection]
- result = self.collection.delete_one(filter_dict)
- except Exception as e:
- print(e)
- else:
- return result
-
- # deletes one or more documents matching the filter
- def delete_documents(self, collection: str, filter_dict: dict):
- try:
- self.collection = self.db[collection]
- result = self.collection.delete_many(filter_dict)
- except Exception as e:
- print(e)
- else:
- return result
-
- # counts the number of documents in collection matching the filter
- def count_documents(self, collection: str, filter_dict: dict):
- try:
- self.collection = self.db[collection]
- result = self.collection.count_documents(filter_dict)
- except Exception as e:
- print(e)
- else:
- return result
-
- def get_collection_names(self):
- return self.db.collection_names()
-
- def get_data_for_evaluation(self, exercise_id: int):
- try:
- self.collection = self.db.feedback
- pipeline = [
- {'$match': {"participation.exercise.id": exercise_id}},
- {"$unwind": {'path': '$participation.results', 'preserveNullAndEmptyArrays': True}},
- {'$unwind': {'path': '$participation.results.feedbacks', 'preserveNullAndEmptyArrays': True}},
- {'$project': {
- 'ID': '$ID',
- 'pID': '$participation.id',
- 'feedbacks': '$participation.results.feedbacks'
- }},
- ]
-
- # df = pd.json_normalize(collection.find({"participation.exercise.id": 1830}, {"participation.results": 1}))
- query_result = self.collection.aggregate(pipeline)
- query_result = list(query_result)
- df = pd.json_normalize(query_result)
-
- if len(df.index) == 0:
- print(f'Exercise {exercise_id} was not tracked!')
- return
-
- # sort feedback by textblock reference
- df = df.sort_values('feedbacks.reference')
-
- # remove newline characters from feedbacks to prevent csv from breaking
- df = df.replace('\n', ' ', regex=True)
-
- # write dataframe to csv
- pd.DataFrame.to_csv(df, './similarity.csv', ';')
-
- return df
- except Exception as e:
- print(e)
- else:
- return result
diff --git a/tracking/src/database/__init__.py b/tracking/src/database/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tracking/src/main.py b/tracking/src/main.py
deleted file mode 100644
index 9d7de4a7..00000000
--- a/tracking/src/main.py
+++ /dev/null
@@ -1,251 +0,0 @@
-import base64
-from math import sqrt
-from os import environ
-
-import jaro
-import numpy as np
-import pandas as pd
-from enum import Enum
-from fastapi import FastAPI, Request, Response, status
-from jwt import decode
-from sklearn import preprocessing
-from sklearn.metrics import cohen_kappa_score
-
-from .database.Connection import Connection
-
-app = FastAPI()
-
-@app.post('/tracking/text-exercise-assessment', status_code=201)
-async def track(request: Request, response: Response):
- feedback = await request.json()
- jwt_token = request.headers.get('x-athene-tracking-authorization')
- secret_base64 = environ['AUTHORIZATION_SECRET']
- try:
- encoded_jwt_token = decode(jwt_token, base64.b64decode(secret_base64), verify=True, algorithms=['HS256'])
- if encoded_jwt_token.get('result_id') != feedback.get('participation').get('results')[0].get('id'):
- response.status_code = status.HTTP_403_FORBIDDEN
- return {'Please do not spam manually!'}
- except Exception as e:
- print(e)
- response.status_code = status.HTTP_401_UNAUTHORIZED
- return {'Your token is not valid!'}
- try:
- conn = Connection()
- conn.insert_document('feedback', feedback)
- except Exception as e:
- print(e)
- response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR
- return {'message': 'Saving in the database did not work!'}
- return {'Feedback is tracked successfully'}
-
-
-@app.get('/tracking/exerciseId/{exercise_id}', status_code=200)
-async def evaluate(exercise_id: int, response: Response):
- try:
- conn = Connection()
- raw_data = conn.get_data_for_evaluation(exercise_id)
- metrics = calculate_metrics(raw_data, exercise_id)
- return metrics
- except Exception as e:
- print(e)
- response.status_code = status.HTTP_404_NOT_FOUND
- return {'message': 'There is no data for this exercise!'}
-
-
-class FeedbackType(Enum):
- Automatic = 1
- Typo = 2
- Extended = 3
- Different = 4
-
-
-def cohens_kappa(l1, l2):
- # transfrom float values into distinc categories
- enc = preprocessing.LabelEncoder()
- enc.fit(np.hstack((l1, l2)))
-
- # calculate kappa
- kappa_val = cohen_kappa_score(enc.transform(l1), enc.transform(l2))
- return kappa_val
-
-
-def jaro_winkler(s1: str, s2: str):
- dis = jaro.jaro_winkler_metric(s1, s2)
- # print(f'Jaro-Winkler: {dis}')
- return dis
-
-
-def jaro_metric(s1: str, s2: str):
- dis = jaro.jaro_metric(s1, s2)
- # print(f'Jaro: {dis}')
- return dis
-
-
-# Calculates the normalized Levenshtein distance of 2 strings
-def levenshtein(s1, s2):
- l1 = len(s1)
- l2 = len(s2)
- matrix = [list(range(l1 + 1))] * (l2 + 1)
- for zz in list(range(l2 + 1)):
- matrix[zz] = list(range(zz, zz + l1 + 1))
- for zz in list(range(0, l2)):
- for sz in list(range(0, l1)):
- if s1[sz] == s2[zz]:
- matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1, matrix[zz][sz + 1] + 1, matrix[zz][sz])
- else:
- matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1, matrix[zz][sz + 1] + 1, matrix[zz][sz] + 1)
- distance = float(matrix[l2][l1])
- result = 1.0 - distance / max(l1, l2)
- # print(f'Levenshtein: {result}')
- return result
-
-
-# Dynamic Programming implementation of LCS problem
-
-def lcs(s1, s2):
- # find the length of the strings
- m = len(s1)
- n = len(s2)
-
- # declaring the array for storing the dp values
- L = [[None] * (n + 1) for i in range(m + 1)]
-
- """Following steps build L[m + 1][n + 1] in bottom up fashion
- Note: L[i][j] contains length of LCS of X[0..i-1]
- and Y[0..j-1]"""
- for i in range(m + 1):
- for j in range(n + 1):
- if i == 0 or j == 0:
- L[i][j] = 0
- elif s1[i - 1] == s2[j - 1]:
- L[i][j] = L[i - 1][j - 1] + 1
- else:
- L[i][j] = max(L[i - 1][j], L[i][j - 1])
-
- # L[m][n] contains the length of LCS of X[0..n-1] & Y[0..m-1]
- # print(f'LCS: {L[m][n] / min(len(s1), len(s2))} (absolute: {L[m][n]}; length_s1: {len(s1)}, length_s2: {len(s2)})')
-
- # prevent division by zero if LCS is 0
- if L[m][n] > 0:
- return min(len(s1), len(s2)) / L[m][n]
- else:
- return 0
-
-
-def st_mean_diff(l1, l2):
- mean_l1 = np.mean(l1)
- mean_l2 = np.mean(l2)
- std_l1 = np.std(l1)
- std_l2 = np.std(l2)
-
- diff = abs((mean_l1 - mean_l2) / sqrt((std_l1 + std_l2) / 2))
-
- return diff
-
-
-def calculate_duration(start, end):
- # calculate submission time
- start_time = start.generation_time
- end_time = end.generation_time
-
- timedelta = end_time - start_time
-
- return timedelta.total_seconds()
-
-
-def classify_comment(s1: str, s2: str):
- if s1 == s2:
- return FeedbackType.Automatic
- elif levenshtein(s1, s2) > 0.9:
- return FeedbackType.Typo
- elif lcs(s1, s2) > 0.95 and jaro_winkler(s1, s2) > 0.6:
- return FeedbackType.Extended
- else:
- return FeedbackType.Different
-
-
-def calculate_metrics(df: pd.DataFrame, exercise_id: int):
- score_first_feedbacks = []
- score_last_feedbacks = []
- automatic_assessment_times = []
- manual_assessment_times = []
- assessed_participations = []
-
- type_count = {
- str(FeedbackType.Automatic): 0,
- str(FeedbackType.Typo): 0,
- str(FeedbackType.Extended): 0,
- str(FeedbackType.Different): 0
- }
- count = 0
- log_count = 0
-
- for reference, df_reference in df.groupby('feedbacks.reference'):
- # sort df since sorting before changed order
- df_reference = df_reference.sort_index()
- count += 1
- # make sure that automatic feedback was provided
- if len(df_reference.index) > 1:
- head = df_reference.head(1)
- tail = df_reference.tail(1)
- if df_reference.head(1)['feedbacks.type'].values[0] == 'AUTOMATIC':
- # only the first and the last entry are important
- score_first_feedbacks.append(head['feedbacks.credits'].values[0])
- score_last_feedbacks.append(tail['feedbacks.credits'].values[0])
-
- # classify feedback
- automatic_comment = head['feedbacks.detailText'].values[0]
- human_comment = tail['feedbacks.detailText'].values[0]
-
- # only calculate duration once for each participation
- participation_id = df_reference.head(1)['pID'].values[0]
- if participation_id not in assessed_participations:
- # calculate duration
- duration = calculate_duration(head['_id'].values[0], tail['_id'].values[0])
- automatic_assessment_times.append(duration)
- assessed_participations.append(participation_id)
-
- feedback_type = classify_comment(str(automatic_comment), str(human_comment))
- type_count[str(feedback_type)] += 1
- else:
- # only calculate duration once for each participation
- participation_id = df_reference.head(1)['pID'].values[0]
- if participation_id not in assessed_participations:
- duration = calculate_duration(head['_id'].values[0], tail['_id'].values[0])
- manual_assessment_times.append(duration)
- assessed_participations.append(participation_id)
- else:
- if df_reference.head(1)['feedbacks.type'].values[0] == 'MANUAL':
- log_count += 1
-
- percentage_provided = round(len(score_first_feedbacks) / count * 100, 2)
-
- # print(log_count)
-
- kappa_val = cohens_kappa(score_first_feedbacks, score_last_feedbacks)
-
- diff = st_mean_diff(score_first_feedbacks, score_last_feedbacks)
-
- metrics = {
- 'exerciseId': exercise_id,
- 'sample_size_total': count,
- 'sample_size_metrics': len(score_first_feedbacks),
- 'percentage_provided': percentage_provided,
- 'cohens_kappa': round(kappa_val, 4),
- 'std_mean_score_diff': round(diff, 4),
- 'comment_distribution': type_count,
- 'percentage_automatic_feedback': round(type_count[str(FeedbackType.Automatic)] / len(score_first_feedbacks), 4),
- 'automatic_assessment_duration': {
- 'min_seconds': round(np.min(automatic_assessment_times), 2),
- 'max_seconds': round(np.max(automatic_assessment_times), 2),
- 'mean_seconds': round(np.mean(automatic_assessment_times), 2),
- 'median_seconds': round(np.median(automatic_assessment_times), 2)},
- 'manual_assessment_duration': {
- 'min_seconds': round(np.min(manual_assessment_times), 2),
- 'max_seconds': round(np.max(manual_assessment_times), 2),
- 'mean_seconds': round(np.mean(manual_assessment_times), 2),
- 'median_seconds': round(np.median(manual_assessment_times), 2)}
- }
-
- print(metrics)
- return metrics
diff --git a/tracking/start.py b/tracking/start.py
deleted file mode 100644
index 242a3e3a..00000000
--- a/tracking/start.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import uvicorn
-
-
-def start():
- uvicorn.run("src.main:app", host="127.0.0.1", port=8004, reload=True, reload_dirs=["src"])
-
-
-if __name__ == "__main__":
- start()
diff --git a/traefik/traefik-dynamic.local.yml b/traefik/traefik-dynamic.local.yml
index f8375cda..5216b711 100644
--- a/traefik/traefik-dynamic.local.yml
+++ b/traefik/traefik-dynamic.local.yml
@@ -23,12 +23,6 @@ http:
entryPoints:
- web
service: embedding
- # http://localhost/tracking
- tracking:
- rule: "Path(`/tracking`)"
- entryPoints:
- - web
- service: tracking
services:
test:
loadBalancer:
@@ -49,7 +43,3 @@ http:
loadBalancer:
servers:
- url: "http://host.docker.internal:8003"
- tracking:
- loadBalancer:
- servers:
- - url: "http://host.docker.internal:8004"