diff --git a/.dockerignore b/.dockerignore
index 0a3bc654..b9084352 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -7,7 +7,6 @@
 .venv
 .vscode
 data
-init-mongo.js
 
 # files and directories in the whole project
 **/.gitignore
diff --git a/.env b/.env
index cff01ddb..9358193e 100644
--- a/.env
+++ b/.env
@@ -12,9 +12,6 @@ LOAD_BALANCER_CONFIG_FILE_PATH=src/node_config.docker.yml
 
 # shared worker variables
 AUTHORIZATION_SECRET=YWVuaXF1YWRpNWNlaXJpNmFlbTZkb283dXphaVF1b29oM3J1MWNoYWlyNHRoZWUzb2huZ2FpM211bGVlM0VpcAo=
-DATABASE_HOST=database
-DATABASE_PORT=27017
-DATABASE_NAME=athene_db
 BALANCER_QUEUE_FREQUENCY=600
 BALANCER_GETTASK_URL=http://load-balancer:8000/getTask
 BALANCER_SENDRESULT_URL=http://load-balancer:8000/sendTaskResult
@@ -23,18 +20,4 @@ BALANCER_SENDRESULT_URL=http://load-balancer:8000/sendTaskResult
 
 # embedding variables
 EMBEDDING_CLOUD_CONFIG_PATH=./embedding/src/cloud/config.py
-EMBEDDING_DATABASE_USER=embedding
-EMBEDDING_DATABASE_PWD=embedding_password
 EMBEDDING_CHUNK_SIZE=50
-
-# clustering variables
-CLUSTERING_DATABASE_USER=embedding
-CLUSTERING_DATABASE_PWD=embedding_password
-
-# tracking variables
-TRACKING_DATABASE_USER=tracking
-TRACKING_DATABASE_PWD=tracking_password
-
-# database variables
-DATABASE_ROOT_USERNAME=root
-DATABASE_ROOT_PASSWORD=root_password
diff --git a/.github/workflows/dockerimage.yml b/.github/workflows/dockerimage.yml
index d2adcd43..6c10efd3 100644
--- a/.github/workflows/dockerimage.yml
+++ b/.github/workflows/dockerimage.yml
@@ -91,22 +91,3 @@ jobs:
         run: ./.github/workflows/scripts/dockerimage.sh "clustering"
       - name: Run unittests for clustering-component
        run: docker run -i --rm --entrypoint python ghcr.io/ls1intum/athena/clustering:${GITHUB_REF##*/} -m unittest discover -p test_*.py
-
-  athene-tracking:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v2
-        with:
-          registry: ghcr.io
-          username: ${GITHUB_ACTOR}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Login to Docker Hub
-        uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.DOCKERHUB_USER }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        if: github.ref == 'refs/heads/master'
-      - name: Build and Push the athene-tracking Docker image
-        run: ./.github/workflows/scripts/dockerimage.sh "tracking"
diff --git a/.github/workflows/scripts/dockerimage.sh b/.github/workflows/scripts/dockerimage.sh
index 227473cc..8f5d4f57 100755
--- a/.github/workflows/scripts/dockerimage.sh
+++ b/.github/workflows/scripts/dockerimage.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-COMPONENT=$1 # Parameter $1 (Component): either "load-balancer", "segmentation", "embedding", "clustering" or "tracking"
+COMPONENT=$1 # Parameter $1 (Component): either "load-balancer", "segmentation", "embedding" or "clustering"
 
 echo -e "INFO: Building ${COMPONENT}-component"
diff --git a/.local.env b/.local.env
index b2004ab2..155dae88 100644
--- a/.local.env
+++ b/.local.env
@@ -11,9 +11,6 @@ LOAD_BALANCER_CONFIG_FILE_PATH=src/node_config.local.yml
 
 # shared worker variables
 AUTHORIZATION_SECRET=YWVuaXF1YWRpNWNlaXJpNmFlbTZkb283dXphaVF1b29oM3J1MWNoYWlyNHRoZWUzb2huZ2FpM211bGVlM0VpcAo=
-DATABASE_HOST=database
-DATABASE_PORT=27017
-DATABASE_NAME=athene_db
 BALANCER_QUEUE_FREQUENCY=600
 BALANCER_GETTASK_URL=http://localhost:8000/getTask
 BALANCER_SENDRESULT_URL=http://localhost:8000/sendTaskResult
@@ -21,18 +18,4 @@ BALANCER_SENDRESULT_URL=http://localhost:8000/sendTaskResult
 # segmentation variables
 
 # embedding variables
-EMBEDDING_DATABASE_USER=embedding
-EMBEDDING_DATABASE_PWD=embedding_password
 EMBEDDING_CHUNK_SIZE=50
-
-# clustering variables
-CLUSTERING_DATABASE_USER=embedding
-CLUSTERING_DATABASE_PWD=embedding_password
-
-# tracking variables
-TRACKING_DATABASE_USER=tracking
-TRACKING_DATABASE_PWD=tracking_password
-
-# database variables
-DATABASE_ROOT_USERNAME=root
-DATABASE_ROOT_PASSWORD=root_password
diff --git a/.run/All Services.run.xml b/.run/All Services.run.xml
index b068262e..c30f9176 100644
--- a/.run/All Services.run.xml
+++ b/.run/All Services.run.xml
@@ -5,7 +5,6 @@
-
\ No newline at end of file
diff --git a/.run/Traefik and DB.run.xml b/.run/Traefik.run.xml
similarity index 64%
rename from .run/Traefik and DB.run.xml
rename to .run/Traefik.run.xml
index 2bc70111..03ceb21a 100644
--- a/.run/Traefik and DB.run.xml
+++ b/.run/Traefik.run.xml
@@ -1,9 +1,9 @@
-
+
diff --git a/.run/tracking.run.xml b/.run/tracking.run.xml
deleted file mode 100644
index fd0aa6d2..00000000
--- a/.run/tracking.run.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-
-
-
-
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 9f7241e7..388945f4 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ export
 
 all: | setup start
 
-setup: .venv setup-protobuf setup-clustering setup-embedding setup-load-balancer setup-segmentation setup-tracking
+setup: .venv setup-protobuf setup-clustering setup-embedding setup-load-balancer setup-segmentation
 
 .venv:
 	python -m venv .venv
@@ -29,10 +29,6 @@ setup-segmentation:
 	$(info Building segmentation)
 	cd segmentation && $(MAKE)
 
-setup-tracking:
-	$(info Building tracking)
-	cd tracking && $(MAKE)
-
 start-clustering: setup-clustering
 	$(info Starting clustering)
 	$(MAKE) -C clustering start
@@ -49,17 +45,13 @@ start-segmentation: setup-segmentation
 	$(info Starting segmentation)
 	$(MAKE) -C segmentation start
 
-start-tracking: setup-tracking
-	$(info Starting tracking)
-	$(MAKE) -C tracking start
-
-start-traefik-db:
-	$(info Starting traefik and db)
-	docker-compose -f docker-compose-traefik-db.yml up
+start-traefik:
+	$(info Starting traefik)
+	docker-compose -f docker-compose-traefik.yml up
 
 start:
 	$(info Starting all services)
-	$(MAKE) -j6 start-clustering start-embedding start-load-balancer start-segmentation start-tracking start-traefik-db
+	$(MAKE) -j6 start-clustering start-embedding start-load-balancer start-segmentation start-traefik
 
 clean:
 	rm -rf .venv
@@ -68,6 +60,5 @@ clean:
 	cd embedding && $(MAKE) clean
 	cd load-balancer && $(MAKE) clean
 	cd segmentation && $(MAKE) clean
-	cd tracking && $(MAKE) clean
 
-.PHONY: all setup setup-protobuf setup-clustering setup-embedding setup-load-balancer setup-segmentation setup-tracking start-clustering start-embedding start-load-balancer start-segmentation start-tracking start-traefik-db start clean
\ No newline at end of file
+.PHONY: all setup setup-protobuf setup-clustering setup-embedding setup-load-balancer setup-segmentation start-clustering start-embedding start-load-balancer start-segmentation start-traefik start clean
\ No newline at end of file
diff --git a/README.md b/README.md
index 34906b07..45585e33 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ which initializes virtual environments, installs dependencies and downloads
 required models.
 After that the services will be started automatically.
 
-There is one special target in the `Makefile` that will start traefik and the MongoDB database in a docker container
+There is one special target in the `Makefile` that will start traefik in a docker container
 to redirect to the services running on your local machine.
 
 You can always just directly use `make` and it will automatically detect changed dependencies.
@@ -87,7 +87,6 @@ If you are using PyCharm, you can configure the project as follows:
    \- `embedding`
    \- `load-balancer`
    \- `segmentation`
-   \- `tracking`
 5. Configure the virtual environment Python interpreters for the different modules:
    For each of the modules in the list above, go to `File -> Settings -> Project: Athena -> Project Interpreter`
    and select the virtual environment in the `.venv` directory of the respective module.
@@ -115,8 +114,6 @@ The following API-routes are available after start:
 * - For the computation components to query tasks from the load balancer
 * - For the computation components to send back results to the load balancer
 * - For Artemis to upload course material
-* - For Artemis to access tracking functionality
-* - For Artemis to access feedback\_consistency functionality
 
 Traefik provides a dashboard to monitor the status of underlying components.
 This dashboard is available on by default.
diff --git a/benchmark/__init__.py b/benchmark/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt
deleted file mode 100644
index 495a63a9..00000000
--- a/benchmark/requirements.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-enum34==1.1.6
-ipaddr==2.2.0
-ipaddress==1.0.17
-requests==2.31.0
-urllib3==1.26.18
-virtualenv==15.1.0
-websocket-client==0.44.0
-
-pyocclient==0.4
-
-nltk==3.6.6
diff --git a/benchmark/src/__init__.py b/benchmark/src/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/src/data/__init__.py b/benchmark/src/data/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/src/data/data_retriever.py b/benchmark/src/data/data_retriever.py
deleted file mode 100644
index 34c525f5..00000000
--- a/benchmark/src/data/data_retriever.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import pandas as pd
-from pandas import read_csv
-from pathlib import Path
-from benchmark.src.entities.text_block import TextBlock
-from benchmark.src.entities.feedback_with_text_block import FeedbackWithTextBlock
-import itertools
-
-__cwd = Path.cwd()
-PATH_LABELED_SUBMISSIONS = (__cwd / "data/resources/text_block.csv").resolve()
-PATH_TEXT_BLOCKS = (__cwd / "data/resources/ArTEMiS_text_block.csv").resolve()
-PATH_FEEDBACK = (__cwd / "data/resources/ArTEMiS_feedback.csv").resolve()
-PATH_FEEDBACK_CONSISTENCY = (__cwd / "data/resources/feedback.csv").resolve()
-PATH_FEEDBACK_CONSISTENCY_OUTPUT = (__cwd / "data/resources/feedback_inconsistencies.csv").resolve()
-
-
-def read_labeled_sentences_from_csv(num_sentences=None):
-    submissions = read_csv(PATH_LABELED_SUBMISSIONS)
-    submissions = submissions[~submissions["manual_cluster_id"].isnull()]
-    sentences = submissions[["text"]].values.flatten()
-    ground_truth_clusters = submissions[["manual_cluster_id"]].values.flatten()
-    ids = submissions[["id"]].values.flatten()
-    if num_sentences is None:
-        num_sentences = len(sentences)
-    else:
-        num_sentences = min(num_sentences, len(sentences))
-    return [TextBlock(sentences[i], ground_truth_cluster=ground_truth_clusters[i], id=ids[i]) for i in
-            range(num_sentences)]
-
-
-def read_sentences_feedback_from_csv(num_sentences=None):
-    text_blocks_csv = read_csv(PATH_TEXT_BLOCKS)
-    feedback_csv = read_csv(PATH_FEEDBACK)
-    result = pd.merge(text_blocks_csv, feedback_csv, left_on="id", right_on="reference")
-    result = result[~result["points"].isnull()]
-    result = result[~result["text"].isnull()]
-    ids = result[["id"]].values.flatten()
-    text_blocks = result[["text"]].values.flatten()
-    points = result[["points"]].values.flatten()
-    if num_sentences is None:
-        num_sentences = len(text_blocks)
-    else:
-        num_sentences = min(num_sentences, len(text_blocks))
-    return [TextBlock(text_blocks[i], ground_truth_grade=points[i], id=ids[i]) for i in range(num_sentences)]
-
-
-def read_feedback_consistency_from_csv():
-    data = read_csv(PATH_FEEDBACK_CONSISTENCY, sep=";", keep_default_na=False)
-    feedback_ids = data[["feedback_id"]].values.flatten()
-    feedback_texts = data[["feedback_text"]].values.flatten()
-    feedback_scores = data[["score"]].values.flatten()
-    references = data[["reference"]].values.flatten()
-    ids = data[["textblock_id"]].values.flatten()
-    texts = data[["textblock_text"]].values.flatten()
-    submission_ids = data[["submission_id"]].values.flatten()
-    cluster_ids = data[["cluster_id"]].values.flatten()
-    blocks = [FeedbackWithTextBlock(textblock_id=ids[i], submission_id=submission_ids[i], cluster_id=cluster_ids[i],
-                                    text=texts[i], feedback_id=feedback_ids[i], feedback_score=feedback_scores[i],
-                                    feedback_text=feedback_texts[i], reference=references[i]) for i in
-              range(len(data)) if feedback_texts[i] and cluster_ids[i] and texts[i] and not feedback_texts[i] == ' ']
-    return [list(i) for j, i in
-            itertools.groupby(sorted(blocks, key=lambda x: x.submission_id), lambda x: x.submission_id)]
-
-
-def write_feedback_inconsistencies_to_csv(inconsistencies):
-    df = pd.DataFrame(list(itertools.chain.from_iterable(inconsistencies)),
-                      columns=['firstFeedbackId', 'secondFeedbackId', 'type'])
-    df.to_csv(PATH_FEEDBACK_CONSISTENCY_OUTPUT, index=False, header=True)
diff --git a/benchmark/src/entities/__init__.py b/benchmark/src/entities/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/src/entities/cluster.py b/benchmark/src/entities/cluster.py
deleted file mode 100644
index 9c2b819e..00000000
--- a/benchmark/src/entities/cluster.py
+++ /dev/null
@@ -1,32 +0,0 @@
-
-
-class Cluster:
-
-    def __init__(self, id, block_ids, probabilities, distances):
-        self.id = id
-        self.block_ids = block_ids
-        self.probabilities = probabilities
-        self.distances = distances
-
-    def contains_block(self, block_id):
-        return block_id in self.block_ids
-
-    def probability_of_block(self, block_id):
-        if not self.contains_block(block_id):
-            return 0.0
-        else:
-            index = self.block_ids.index(block_id)
-            return self.probabilities[index]
-
-    @staticmethod
-    def clusters_from_network_response(response):
-        clusters = []
-        for id, cluster_data in response.items():
-            block_ids = [block["id"] for block in cluster_data["blocks"]]
-            clusters.append(Cluster(id, block_ids, cluster_data["probabilities"], cluster_data["distanceMatrix"]))
-        return clusters
-
-    def __str__(self):
-        return "Cluster {} with blocks {}".format(self.id, self.block_ids)
-
-
diff --git a/benchmark/src/entities/feedback_with_text_block.py b/benchmark/src/entities/feedback_with_text_block.py
deleted file mode 100644
index 9936b613..00000000
--- a/benchmark/src/entities/feedback_with_text_block.py
+++ /dev/null
@@ -1,23 +0,0 @@
-
-class FeedbackWithTextBlock:
-
-    def __init__(self, feedback_id: int, feedback_text: str, feedback_score: float, reference: str, textblock_id: str, text: str,
-                 submission_id: int, cluster_id: int):
-        self.id = textblock_id
-        self.submission_id = submission_id
-        self.cluster_id = cluster_id
-        self.text = text
-        self.feedback_id = feedback_id
-        self.feedback_text = feedback_text
-        self.feedback_score = feedback_score
-        self.reference = reference
-
-    def json_rep(self):
-        return {
-            'textBlockId': self.id,
-            'clusterId': str(self.cluster_id),
-            'text': self.text,
-            'feedbackId': str(self.feedback_id),
-            'feedbackText': self.feedback_text,
-            'credits': self.feedback_score
-        }
diff --git a/benchmark/src/entities/text_block.py b/benchmark/src/entities/text_block.py
deleted file mode 100644
index 40dd9899..00000000
--- a/benchmark/src/entities/text_block.py
+++ /dev/null
@@ -1,52 +0,0 @@
-
-
-class TextBlock:
-    __last_id = 0
-
-    def __init__(self, text, id=None, ground_truth_cluster=0, ground_truth_grade=None):
-        self.text = text
-        self.original_text = text
-        self.ground_truth_cluster = ground_truth_cluster
-        self.ground_truth_grade = ground_truth_grade
-        self.cluster = 0
-        self.embedding = None
-        self.grade_from_cluster = None
-        self.probability_in_cluster = None
-        if id is None:
-            TextBlock.__last_id = TextBlock.__last_id + 1
-            self.id = TextBlock.__last_id
-        else:
-            self.id = id
-            TextBlock.__last_id = id
-
-    def __str__(self):
-        self.text.__str__()
-
-    def json_rep(self):
-        return {
-            'id': self.id,
-            'text': self.text
-        }
-
-    def extract_cluster(self, clusters: list):
-        self.cluster = [cluster for cluster in clusters if cluster.contains_block(self.id)][0]
-        self.probability_in_cluster = self.cluster.probability_of_block(self.id)
-
-    def extract_embedding(self, embeddings: list):
-        self.embedding = [embedding['vector'] for embedding in embeddings if embedding['id'] == self.id][0]
-
-    def similar(self, other):
-        return self.cluster.id == other.cluster.id
-
-    def ground_truth_similar(self, other):
-        return self.ground_truth_cluster == other.ground_truth_cluster
-
-    def compute_grade_from_cluster(self, text_blocks):
-        if self.cluster is None:
-            raise Exception("cluster for text block {} not defined".format(self.id))
-        cluster_grades = [block.ground_truth_grade for block in text_blocks if self.similar(block)]
-        self.grade_from_cluster = sum(cluster_grades) / float(len(cluster_grades))
-
-    @staticmethod
-    def from_sentences(sentences):
-        return [TextBlock(sentence) for sentence in sentences]
diff --git a/benchmark/src/main.py b/benchmark/src/main.py
deleted file mode 100644
index 8335d0f4..00000000
--- a/benchmark/src/main.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import logging
-import sys
-import matplotlib.pyplot as plt
-from benchmark.src.data.data_retriever import read_labeled_sentences_from_csv, read_sentences_feedback_from_csv, \
-    read_feedback_consistency_from_csv, write_feedback_inconsistencies_to_csv
-from benchmark.src.entities.cluster import Cluster
-from benchmark.src.entities.text_block import TextBlock
-from benchmark.src.networking.api_services import *
-from benchmark.src.plotting import plot_embeddings
-from benchmark.src.similarity_measure import PrecisionRecallSimilarity, GradeBasedSimilarity
-
-__logger = getLogger(__name__)
-
-
-def process_text_blocks(text_blocks, courseId=None, plot=True, log_clusters=False):
-    embeddings = embed(text_blocks, courseId=courseId)
-    clusters = Cluster.clusters_from_network_response(cluster(embeddings))
-    for text_block in text_blocks:
-        text_block.extract_cluster(clusters)
-        text_block.extract_embedding(embeddings)
-    if plot:
-        plot_embeddings(text_blocks)
-    if log_clusters:
-        cluster_to_text = ["cluster {}: {}".format(textblock.cluster.id, textblock.original_text) for textblock in
-                           text_blocks]
-        cluster_to_text.sort()
-        for result in cluster_to_text:
-            logger.info(result + "\n")
-    return text_blocks
-
-
-def evaluate_by_labeled_sentences(courseId=None):
-    text_blocks = read_labeled_sentences_from_csv()
-    text_blocks = process_text_blocks(text_blocks, courseId)
-    similarity_measure = PrecisionRecallSimilarity(text_blocks)
-    __logger.info("similarity labeled data for course {}".format(courseId))
-    similarity_measure.output_results()
-
-
-def evaluate_by_artemis_data(courseId=None):
-    text_blocks = read_sentences_feedback_from_csv(num_sentences=1000)
-    text_blocks = process_text_blocks(text_blocks, courseId)
-    similarity_measure = GradeBasedSimilarity(text_blocks)
-    __logger.info("similarity grade-based for course {}".format(courseId))
-    similarity_measure.output_results()
-
-
-def plot_sentences(sentences, courseId=None):
-    text_blocks = [TextBlock(sentence) for sentence in sentences]
-    process_text_blocks(text_blocks, courseId, plot=True)
-
-
-def feedback_consistency_test(exercise_id):
-    data = read_feedback_consistency_from_csv()
-    inconsistencies = check_feedback_consistency(feedback_with_text_blocks=data, exercise_id=exercise_id)
-    write_feedback_inconsistencies_to_csv(inconsistencies)
-
-
-if __name__ == "__main__":
-    logger = logging.getLogger()
-    logger.setLevel(logging.DEBUG)
-
-    handler = logging.StreamHandler(sys.stdout)
-    handler.setLevel(logging.DEBUG)
-    formatter = logging.Formatter('[%(asctime)s] [%(process)d] [%(levelname)s] [%(name)s] %(message)s')
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-
-    sentences = [
-        "class diagram depicts the structure of the system",
-        "class diagram is a system model",
-        "one of the system models is a class diagram",
-        "the structure of the system are represented in a class diagram",
-        "class diagrams contain classes and relations between them ",
-        "class diagram is a UML model",
-        "a diagram was presented in class",
-        "we didn't deal with diagrams in class ",
-        "Diagrams are part of this class",
-        "This is a first class flight",
-        "there are different classes of diagrams",
-        "I booked first class seat on the train",
-    ]
-
-    feedback_consistency_test('1')
-
-    evaluate_by_labeled_sentences(1478643)
-    evaluate_by_labeled_sentences(81)
-    evaluate_by_labeled_sentences()
-
-    plt.show()
diff --git a/benchmark/src/networking/__init__.py b/benchmark/src/networking/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/src/networking/api_requests.py b/benchmark/src/networking/api_requests.py
deleted file mode 100644
index 8ae2c752..00000000
--- a/benchmark/src/networking/api_requests.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from logging import getLogger
-
-import requests
-
-__logger = getLogger(__name__)
-
-
-def post(api_endpoint, data):
-    response = requests.post(url=api_endpoint, json=data)
-
-    if not response:
-        __logger.error("POST failed on {}: Status Code: {}".format(api_endpoint, response.status_code))
-        return None
-
-    return response.json() if response.status_code != 204 else None
diff --git a/benchmark/src/networking/api_services.py b/benchmark/src/networking/api_services.py
deleted file mode 100644
index 2d371a76..00000000
--- a/benchmark/src/networking/api_services.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from logging import getLogger
-from benchmark.src.networking.api_requests import post
-import numpy as np
-
-__logger = getLogger(__name__)
-
-SEGMENTATION_URL = "http://localhost:8000/segment"
-EMBEDDING_URL = "http://localhost:8001/embed"
-CLUSTERING_URL = "http://localhost:8002/cluster"
-FEEDBACK_CONSISTENCY_URL = "http://localhost:8001/feedback_consistency"
-
-
-def segment(submissions, keywords=None):
-    # request with {"submissions":[{id:,text:}],"keywords":[]}
-    # response with {"keywords":[],"textBlocks":[{id:,startIndex:,endIndex}]}
-    request = {"submissions": submissions}
-    if keywords is not None:
-        request["keywords"] = keywords
-    return post(SEGMENTATION_URL, request)
-
-
-def __check_feedback_consistency(feedback_with_text_block, exerciseId):
-    # request with {"feedbackWithTextBlock":[{'textBlockId':,'clusterId':,'text':,'feedbackId':,'feedbackText':,'credits':}]}
-    # {"feedbackInconsistencies":[{'firstFeedbackId':,'secondFeedbackId':,'type':]}
-    request = {"feedbackWithTextBlock": feedback_with_text_block, "exerciseId": exerciseId}
-    return post(FEEDBACK_CONSISTENCY_URL, request)
-
-
-def check_feedback_consistency(feedback_with_text_blocks, exercise_id):
-    inconsistencies = []
-    for fwt in feedback_with_text_blocks:
-        feedback_with_text_block = [block.json_rep() for block in fwt]
-        response = __check_feedback_consistency(feedback_with_text_block, exercise_id)
-        if response['feedbackInconsistencies']:
-            inconsistencies.append(response['feedbackInconsistencies'])
-    return np.array(inconsistencies).flatten().tolist()
-
-
-def __embed(text_blocks, courseId=None):
-    # request with {'courseId': 25, 'blocks': [{'id': 1, 'text': 'this is the first block'}, {'id': 2, 'text': 'this is the second block'}]}
-    # response with { 'embeddings': [{'id': , 'vector':[]}] }
-    request = {"blocks": [text_block.json_rep() for text_block in text_blocks]}
-    if courseId is not None:
-        request["courseId"] = courseId
-    return post(EMBEDDING_URL, request)['embeddings']
-
-
-def embed(text_blocks, courseId=None):
-    split_text_blocks = [text_blocks]
-    if len(text_blocks) > 50:
-        split_text_blocks = np.array_split(np.array(text_blocks), len(text_blocks) / 50)
-    embeddings = list(map(lambda blocks: __embed(blocks, courseId), split_text_blocks))
-    return [embedding for embedding_list in embeddings for embedding in embedding_list]
-
-
-def cluster(embeddings):
-    # request with { "embeddings": [{"id": ,"vector":[]}] }
-    # response with {"clusters": {"-1": {"blocks": [{"id": 1}, {"id": 2}], "probabilities": [0.0, 0.0], "distanceMatrix": [[0.0, 0.22923004776660816], [0.22923004776660816, 0.0]]}}}
-    request = {"embeddings": embeddings}
-    return post(CLUSTERING_URL, request)['clusters']
diff --git a/benchmark/src/plotting.py b/benchmark/src/plotting.py
deleted file mode 100644
index 9477286a..00000000
--- a/benchmark/src/plotting.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import matplotlib.pyplot as plt
-import seaborn as sns
-from sklearn.manifold import TSNE
-
-from benchmark.src.entities.text_block import TextBlock
-
-tsne = TSNE(n_components=2, random_state=0)
-
-
-def reduce_dimensions(vectors):
-    return tsne.fit_transform(vectors)
-
-
-def plot_embeddings(textblocks: [TextBlock], persist_labels=False):
-    textblocks = [textblock for textblock in textblocks if int(textblock.cluster.id) >= -1]
-    vectors = [textblock.embedding for textblock in textblocks]
-    texts = [textblock.original_text for textblock in textblocks]
-    clusters = [int(textblock.cluster.id) for textblock in textblocks]
-    # clusters = [int(textblock.ground_truth_cluster) for textblock in textblocks]
-    probabilities = [textblock.probability_in_cluster for textblock in textblocks]
-    vectors = reduce_dimensions(vectors)
-
-    color_palette = sns.color_palette('deep', max(clusters) + 1)
-    cluster_colors = [color_palette[x] if x >= 0
-                      else (0.5, 0.5, 0.5)
-                      for x in clusters]
-    cluster_member_colors = [sns.desaturate(x, p) for x, p in
-                             zip(cluster_colors, probabilities)]
-
-    x = vectors[:, 0]
-    y = vectors[:, 1]
-    labels = texts
-    colors = cluster_member_colors
-
-    norm = plt.Normalize(1, 4)
-    fig, ax = plt.subplots()
-    sc = plt.scatter(x, y, c=colors, s=100, norm=norm)
-    # plt.xlim(-200, 250)
-    # plt.xlim(-200, 250)
-
-    if persist_labels :
-        for i in range(len(x)):
-            annotation = ax.annotate("", xy=(x[i], y[i]), xytext=(20, 20), textcoords="offset points",
-                                     bbox=dict(boxstyle="round", fc="w"),
-                                     arrowprops=dict(arrowstyle="->"))
-            annotation.set_text(texts[i])
-            annotation.get_bbox_patch().set_alpha(0.4)
-            annotation.set_visible(True)
-    else:
-        annotation = ax.annotate("", xy=(0, 0), xytext=(20, 20), textcoords="offset points",
-                                 bbox=dict(boxstyle="round", fc="w"),
-                                 arrowprops=dict(arrowstyle="->"))
-        annotation.set_visible(False)
-
-    def update_annot(ind):
-        pos = sc.get_offsets()[ind["ind"][0]]
-        annotation.xy = pos
-        text = "{}".format(" ".join([labels[n] for n in ind["ind"]]))
-        annotation.set_text(text)
-        annotation.get_bbox_patch().set_alpha(0.4)
-
-    def hover(event):
-        vis = annotation.get_visible()
-        if event.inaxes == ax:
-            cont, ind = sc.contains(event)
-            if cont:
-                update_annot(ind)
-                annotation.set_visible(True)
-                fig.canvas.draw_idle()
-            else:
-                if vis:
-                    annotation.set_visible(False)
-                    fig.canvas.draw_idle()
-    fig.canvas.mpl_connect("motion_notify_event", hover)
diff --git a/benchmark/src/similarity_measure.py b/benchmark/src/similarity_measure.py
deleted file mode 100644
index a7354bf6..00000000
--- a/benchmark/src/similarity_measure.py
+++ /dev/null
@@ -1,68 +0,0 @@
-from abc import ABC, abstractmethod
-from logging import getLogger
-
-
-class SimilarityMeasure(ABC):
-    @abstractmethod
-    def output_results(self):
-        pass
-
-
-class PrecisionRecallSimilarity(SimilarityMeasure):
-    __logger = getLogger(__name__)
-
-    def __init__(self, text_blocks):
-        self.text_blocks = text_blocks
-        self.false_negatives = 0
-        self.false_positives = 0
-        self.true_negatives = 0
-        self.true_positives = 0
-
-        for text_block in text_blocks:
-            for other in text_blocks:
-                if text_block.similar(other) and text_block.ground_truth_similar(other):
-                    self.true_positives += 1
-                if not (text_block.similar(other)) and not (text_block.ground_truth_similar(other)):
-                    self.true_negatives += 1
-                if text_block.similar(other) and not (text_block.ground_truth_similar(other)):
-                    self.false_positives += 1
-                if not (text_block.similar(other)) and text_block.ground_truth_similar(other):
-                    self.false_negatives += 1
-
-        self.precision = self.true_positives / (1.0 * (self.true_positives + self.false_positives))
-        self.recall = self.true_positives / (1.0 * (self.true_positives + self.false_negatives))
-        self.f1_score = 2 * ((self.precision * self.recall) / (self.precision + self.recall))
-
-    def output_results(self):
-        self.__logger.info('The achieved precision is {}'.format(self.precision))
-        self.__logger.info('The achieved recall is {}'.format(self.recall))
-        self.__logger.info('The achieved F1_score is {}'.format(self.f1_score))
-
-
-class GradeBasedSimilarity(SimilarityMeasure):
-    __logger = getLogger(__name__)
-
-    def __init__(self, text_blocks):
-        for text_block in text_blocks:
-            text_block.compute_grade_from_cluster(text_blocks)
-        self.text_blocks = text_blocks
-        self.l1_loss = sum(
-            [abs((text_block.grade_from_cluster - text_block.ground_truth_grade)) for text_block in text_blocks]) / \
-                       len(text_blocks)
-
-    def output_results(self):
-        self.__logger.info('The L1 loss for the model is {}'.format(self.l1_loss))
-        max_over_graded = max(self.text_blocks,
-                              key=lambda text_block: text_block.grade_from_cluster - text_block.ground_truth_grade)
-        self.__logger.info(
-            "The most over-graded sentence is \"{}\". \n Assigned:{} but ground truth: {}".format(
-                max_over_graded.original_text,
-                max_over_graded.grade_from_cluster,
-                max_over_graded.ground_truth_grade))
-        max_under_graded = max(self.text_blocks,
-                               key=lambda text_block: text_block.ground_truth_grade - text_block.grade_from_cluster)
-        self.__logger.info(
-            "The most under-graded sentence is \"{}\". \n Assigned:{} but ground truth: {}".format(
-                max_under_graded.original_text,
-                max_under_graded.grade_from_cluster,
-                max_under_graded.ground_truth_grade))
diff --git a/benchmark/src/test/__init__.py b/benchmark/src/test/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/src/test/test_clustering.py b/benchmark/src/test/test_clustering.py
deleted file mode 100644
index 7acd0cf8..00000000
--- a/benchmark/src/test/test_clustering.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from unittest import TestCase
-
-from benchmark.src.entities.text_block import TextBlock
-from benchmark.src.networking.api_services import cluster, embed
-
-sentences_flowers = ["A second flower blossomed and remained.",
-                     "I have red and yellow flowers",
-                     "flowers and roses are beautiful",
-                     "She picked the flower up and smelled it"]
-sentences_software = ["this is the clustering component of the text assessment software engineering project",
-                      "In software engineering, a software design pattern is a general, reusable solution to a commonly occurring problem within a given context in software design.",
-                      "Patterns in software engineering is a lecture at TUM",
-                      "Software engineering is defined as a process of analyzing user requirements and then designing, building, and testing software"]
-sentences_law = ["the congress decided against this law",
-                 "I want to study law and become lawyer",
-                 "you can't brake the law like this",
-                 "Law breaking is usually punished with jail"]
-
-embeddings_flowers = embed(TextBlock.from_sentences(sentences_flowers))
-embeddings_software = embed(TextBlock.from_sentences(sentences_software))
-embeddings_law = embed(TextBlock.from_sentences(sentences_law))
-
-
-class TestClustering(TestCase):
-
-    def test_cluster_same_sentences(self):
-        embeddings_software_repeated = embed(TextBlock.from_sentences([sentences_software[0]] * 5))
-        clusters = cluster(embeddings_software_repeated)
-        print(clusters)
-        self.assertEqual(1, len(clusters))
-
-    def test_cluster_similar_sentences(self):
-        clusters = cluster(embeddings_flowers)
-        self.assertEqual(1, len(clusters))
-
-        clusters = cluster(embeddings_software)
-        self.assertEqual(1, len(clusters))
-
-        clusters = cluster(embeddings_law)
-        self.assertEqual(1, len(clusters))
-
-    def test_cluster_different_topics(self):
-        clusters = cluster(embeddings_flowers+embeddings_software+embeddings_law)
-        # test: there are 3 different clusters
-        self.assertEqual(3, len(clusters))
diff --git a/clustering/requirements.txt b/clustering/requirements.txt
index 90016305..6c0dd343 100644
--- a/clustering/requirements.txt
+++ b/clustering/requirements.txt
@@ -5,7 +5,6 @@ matplotlib==3.7.0
 numpy==1.24.2
 pandas==1.5.3
 pydantic==1.10.4
-pymongo==4.3.3
 requests==2.31.0
 scikit-learn==1.2.1
 scipy==1.10.0
diff --git a/clustering/src/database/Connection.py b/clustering/src/database/Connection.py
deleted file mode 100644
index 91d1b3d8..00000000
--- a/clustering/src/database/Connection.py
+++ /dev/null
@@ -1,112 +0,0 @@
-import os
-import pymongo
-
-
-# this class contains most of the important collection level pymongo operations but not all of them
-# for the whole list and detailed explanations - https://api.mongodb.com/python/current/api/pymongo/collection.html
-class Connection:
-
-    def __init__(self):
-        # Get container variables for datbase connection
-        dbhost = str(os.environ['DATABASE_HOST']) if "DATABASE_HOST" in os.environ else "database"
-        dbport = int(os.environ['DATABASE_PORT']) if "DATABASE_PORT" in os.environ else 27017
-        dbname = str(os.environ['DATABASE_NAME']) if "DATABASE_NAME" in os.environ else "athene_db"
-        dbuser = str(os.environ['CLUSTERING_DATABASE_USER']) if "CLUSTERING_DATABASE_USER" in os.environ else "clustering"
-        dbpwd = str(os.environ['CLUSTERING_DATABASE_PWD']) if "CLUSTERING_DATABASE_PWD" in os.environ else "clustering_password"
-        self.client = pymongo.MongoClient(host=dbhost, port=dbport, username=dbuser, password=dbpwd,
-                                          authSource=dbname)
-        self.db = self.client[dbname]
-        self.collection = None
-
-    # inserts one document to a collection
-    # collection {string} - collection name to store the document
-    # document {field-value pairs} - e.g. {'x': 1, 'y': "apples"}
-    def insert_document(self, collection, document):
-        try:
-            self.collection = self.db[collection]
-            self.collection.insert_one(document)
-        except Exception as e:
-            print(e)
-
-    # inserts an array of documents to a collection
-    # collection {string} - collection name to store the document
-    # document {array} - e.g. [{'x': 1, 'y': "apples"}, {'x': 15, 'y': "oranges", 'z': 40.5}]
-    def insert_documents(self, collection, documents: []):
-        try:
-            self.collection = self.db[collection]
-            self.collection.insert_many(documents)
-        except Exception as e:
-            print(e)
-
-    # query database and returns results
-    # filter_dict {field-value pairs} - specifies elements which must be present in the resulting set
-    # projection {field-value pairs} - list of field names should be included or excluded in the resulting set. e.g. {‘_id’: False} _id values will be excluded in the resulting set
-    # skip {int} - number of documents to omit (from the start of the result set) when returning the results
-    # limit {int} - max number of results to return
-    # max_time_ms {int} - Specifies a time limit for a query operation. If the specified time is exceeded, the operation will be aborted
-    def find_documents(self, collection, filter_dict, projection=None, skip=0, limit=0, max_time_ms=None):
-        try:
-            self.collection = self.db[collection]
-            docs = self.collection.find(filter=filter_dict, projection=projection, skip=skip, limit=limit,
-                                        max_time_ms=max_time_ms)
-        except Exception as e:
-            print(e)
-        else:
-            return docs
-
-    # update a document matching the filter
-    # filter_dict {field-value pairs} - find the document to update e.g. {'x': 1}
-    # update_dict {field-value pairs} - modifications to apply e.g. {'$set': {'x': 3}}
-    # upsert {boolean} - if true performs insert when no documents match the filter
-    # Note: For the full list of update parameters https://docs.mongodb.com/manual/reference/operator/update/
-    def update_document(self, collection, filter_dict, update_dict, upsert=False):
-        try:
-            self.collection = self.db[collection]
-            result = self.collection.update_one(filter_dict, update_dict, upsert)
-        except Exception as e:
-            print(e)
-        else:
-            return result
-
-    # updates one or more documents matching the filter
-    def update_documents(self, collection, filter_dict, update_dict, upsert=False):
-        try:
-            self.collection = self.db[collection]
-            result = self.collection.update_many(filter_dict, update_dict, upsert)
-        except Exception as e:
-            print(e)
-        else:
-            return result
-
-    # deletes one document matching the filter
-    def delete_document(self, collection, filter_dict):
-        try:
-            self.collection = self.db[collection]
-            result = self.collection.delete_one(filter_dict)
-        except Exception as e:
-            print(e)
-        else:
-            return result
-
-    # deletes one or more documents matching the filter
-    def delete_documents(self, collection, filter_dict):
-        try:
-            self.collection = self.db[collection]
-            result = self.collection.delete_many(filter_dict)
-        except Exception as e:
-            print(e)
-        else:
-            return result
-
-    # counts the number of documents in collection matching the filter
-    def count_documents(self, collection, filter_dict):
-        try:
-            self.collection = self.db[collection]
-            result = self.collection.count_documents(filter_dict)
-        except Exception as e:
-            print(e)
-        else:
-            return result
-
-    def get_collection_names(self):
-        return self.db.collection_names()
diff --git a/clustering/src/database/__init__.py b/clustering/src/database/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/docker-compose-traefik-db.yml b/docker-compose-traefik-db.yml
deleted file mode 100644
index c41bce8f..00000000
--- a/docker-compose-traefik-db.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-version: '3'
-
-# Use this docker file to only start Traefik and the database.
-# You will have to start the microservices manually, see the individual READMEs for details.
-
-services:
-  traefik:
-    image: traefik:v2.9.6
-    container_name: athene-traefik-local
-    restart: unless-stopped
-    ports:
-      - ${TRAEFIK_DASHBOARD_PORT}:8080
-      - ${TRAEFIK_HTTP_PORT}:80
-    volumes:
-      - ./traefik/traefik.local.yml:/etc/traefik/traefik.yml:ro
-      - ./traefik/traefik-dynamic.local.yml:/etc/traefik/traefik-dynamic.local.yml:ro
-
-  database:
-    image: mongo:latest
-    container_name: athene-mongodb
-    restart: unless-stopped
-    expose:
-      - 27017
-    environment:
-      - MONGO_INITDB_DATABASE=${DATABASE_NAME}
-      - MONGO_INITDB_ROOT_USERNAME=${DATABASE_ROOT_USERNAME}
-      - MONGO_INITDB_ROOT_PASSWORD=${DATABASE_ROOT_PASSWORD}
-    volumes:
-      - ./init-mongo.js:/docker-entrypoint-initdb.d/init-mongo.js:ro
-      - ./data/db:/data/db
diff --git a/docker-compose-traefik.yml b/docker-compose-traefik.yml
new file mode 100644
index 00000000..a7ba494a
--- /dev/null
+++ b/docker-compose-traefik.yml
@@ -0,0 +1,16 @@
+version: '3'
+
+# Use this docker file to only start Traefik.
+# You will have to start the microservices manually, see the individual READMEs for details.
+
+services:
+  traefik:
+    image: traefik:v2.9.6
+    container_name: athene-traefik-local
+    restart: unless-stopped
+    ports:
+      - ${TRAEFIK_DASHBOARD_PORT}:8080
+      - ${TRAEFIK_HTTP_PORT}:80
+    volumes:
+      - ./traefik/traefik.local.yml:/etc/traefik/traefik.yml:ro
+      - ./traefik/traefik-dynamic.local.yml:/etc/traefik/traefik-dynamic.local.yml:ro
diff --git a/docker-compose.override.yml b/docker-compose.override.yml
index 5ecfff74..5df97877 100644
--- a/docker-compose.override.yml
+++ b/docker-compose.override.yml
@@ -45,10 +45,3 @@ services:
   clustering:
     volumes:
       - ./clustering/src:/usr/src/app/src
-
-  tracking:
-    volumes:
-      - ./tracking/src:/usr/src/app/src
-
-  # database:
-  # This component does not need to be changed
diff --git a/docker-compose.yml b/docker-compose.yml
index 9f5bb1f7..b25da03d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -11,7 +11,6 @@ services:
       # These components have API endpoints managed by traefik
       - load-balancer
      - embedding
-      - tracking
     ports:
       - ${TRAEFIK_DASHBOARD_PORT}:8080
       - ${TRAEFIK_HTTP_PORT}:80
@@ -82,15 +81,9 @@
     restart: unless-stopped
     depends_on:
       - load-balancer
-      - database
     expose:
       - 8000
     environment:
-      - DATABASE_HOST
-      - DATABASE_PORT
-      - DATABASE_NAME
-      - EMBEDDING_DATABASE_USER
-      - EMBEDDING_DATABASE_PWD
       - AUTHORIZATION_SECRET
       - BALANCER_QUEUE_FREQUENCY
       - BALANCER_GETTASK_URL
@@ -118,15 +111,9 @@
     restart: unless-stopped
     depends_on:
       - load-balancer
-      - database
     expose:
       - 8000
     environment:
-      - DATABASE_HOST
-      - DATABASE_PORT
-      - DATABASE_NAME
-      - CLUSTERING_DATABASE_USER
-      - CLUSTERING_DATABASE_PWD
       - AUTHORIZATION_SECRET
       - BALANCER_QUEUE_FREQUENCY
       - BALANCER_GETTASK_URL
@@ -137,52 +124,6 @@
     labels:
       - traefik.enable=true
 
-  # http://localhost/tracking
-  tracking:
-    build:
-      context: .
- dockerfile: ./tracking/Dockerfile - image: athene-tracking - container_name: athene-tracking - restart: unless-stopped - depends_on: - - database - expose: - - 8000 - environment: - - AUTHORIZATION_SECRET - - DATABASE_HOST - - DATABASE_PORT - - DATABASE_NAME - - TRACKING_DATABASE_USER - - TRACKING_DATABASE_PWD - working_dir: /usr/src/app - networks: - - athene - labels: - - traefik.enable=true - - traefik.http.routers.tracking.rule=PathPrefix(`/tracking`) - - traefik.http.routers.tracking.entrypoints=web - - traefik.http.routers.tracking-tls.rule=PathPrefix(`/tracking`) - - traefik.http.routers.tracking-tls.entrypoints=websecure - - traefik.http.routers.tracking-tls.tls=true - - database: - image: mongo:latest - container_name: athene-mongodb - restart: unless-stopped - expose: - - 27017 - environment: - - MONGO_INITDB_DATABASE=${DATABASE_NAME} - - MONGO_INITDB_ROOT_USERNAME=${DATABASE_ROOT_USERNAME} - - MONGO_INITDB_ROOT_PASSWORD=${DATABASE_ROOT_PASSWORD} - volumes: - - ./init-mongo.js:/docker-entrypoint-initdb.d/init-mongo.js:ro - - ./data/db:/data/db - networks: - - athene - networks: athene: driver: bridge diff --git a/embedding/requirements.txt b/embedding/requirements.txt index 1789ff6b..adcd696e 100644 --- a/embedding/requirements.txt +++ b/embedding/requirements.txt @@ -7,7 +7,6 @@ fastapi==0.95.2 joblib==1.2.0 nltk==3.8.1 numpy==1.20 # needs to be 1.20 because after that, np.int does not exist any more, which breaks nltk -pymongo==4.3.3 requests==2.31.0 scikit-learn==0.22 # needs to be 0.22 because after that, sklearn.utils.linear_assignment_ is removed, which breaks allennlp scipy==1.10.0 diff --git a/embedding/src/database/Connection.py b/embedding/src/database/Connection.py deleted file mode 100644 index 183b0fff..00000000 --- a/embedding/src/database/Connection.py +++ /dev/null @@ -1,121 +0,0 @@ -import os -import pymongo - - -# this class contains most of the important collection level pymongo operations but not all of them -# for the whole list and detailed explanations - https://api.mongodb.com/python/current/api/pymongo/collection.html -class Connection: - - def __init__(self): - # Get container variables for datbase connection - dbhost = str(os.environ['DATABASE_HOST']) if "DATABASE_HOST" in os.environ else "database" - dbport = int(os.environ['DATABASE_PORT']) if "DATABASE_PORT" in os.environ else 27017 - dbname = str(os.environ['DATABASE_NAME']) if "DATABASE_NAME" in os.environ else "athene_db" - dbuser = str(os.environ['EMBEDDING_DATABASE_USER']) if "EMBEDDING_DATABASE_USER" in os.environ else "embedding" - dbpwd = str(os.environ['EMBEDDING_DATABASE_PWD']) if "EMBEDDING_DATABASE_PWD" in os.environ else "embedding_password" - self.client = pymongo.MongoClient(host=dbhost, port=dbport, username=dbuser, password=dbpwd, - authSource=dbname) - self.db = self.client[dbname] - self.collection = None - - # inserts one document to a collection - # collection {string} - collection name to store the document - # document {field-value pairs} - e.g. {'x': 1, 'y': "apples"} - def insert_document(self, collection, document): - try: - self.collection = self.db[collection] - self.collection.insert_one(document) - except Exception as e: - raise e - - # inserts an array of documents to a collection - # collection {string} - collection name to store the document - # document {array} - e.g. 
[{'x': 1, 'y': "apples"}, {'x': 15, 'y': "oranges", 'z': 40.5}] - def insert_documents(self, collection, documents: []): - try: - self.collection = self.db[collection] - self.collection.insert_many(documents) - except Exception as e: - raise e - - # query database and returns results - # filter_dict {field-value pairs} - specifies elements which must be present in the resulting set - # projection {field-value pairs} - list of field names should be included or excluded in the resulting set. e.g. {‘_id’: False} _id values will be excluded in the resulting set - # skip {int} - number of documents to omit (from the start of the result set) when returning the results - # limit {int} - max number of results to return - # max_time_ms {int} - Specifies a time limit for a query operation. If the specified time is exceeded, the operation will be aborted - def find_documents(self, collection, filter_dict, projection=None, skip=0, limit=0, max_time_ms=None): - try: - self.collection = self.db[collection] - docs = self.collection.find(filter=filter_dict, projection=projection, skip=skip, limit=limit, - max_time_ms=max_time_ms) - except Exception as e: - raise e - else: - return docs - - # update a document matching the filter - # filter_dict {field-value pairs} - find the document to update e.g. {'x': 1} - # update_dict {field-value pairs} - modifications to apply e.g. {'$set': {'x': 3}} - # upsert {boolean} - if true performs insert when no documents match the filter - # Note: For the full list of update parameters https://docs.mongodb.com/manual/reference/operator/update/ - def update_document(self, collection, filter_dict, update_dict, upsert=False): - try: - self.collection = self.db[collection] - result = self.collection.update_one(filter_dict, update_dict, upsert) - except Exception as e: - raise e - else: - return result - - # updates one or more documents matching the filter - def update_documents(self, collection, filter_dict, update_dict, upsert=False): - try: - self.collection = self.db[collection] - result = self.collection.update_many(filter_dict, update_dict, upsert) - except Exception as e: - raise e - else: - return result - - def replace_document(self, collection, filter_dict, replacement_dict, upsert=False): - try: - self.collection = self.db[collection] - result = self.collection.replace_one(filter_dict, replacement_dict, upsert) - except Exception as e: - raise e - else: - return result - - # deletes one document matching the filter - def delete_document(self, collection, filter_dict): - try: - self.collection = self.db[collection] - result = self.collection.delete_one(filter_dict) - except Exception as e: - raise e - else: - return result - - # deletes one or more documents matching the filter - def delete_documents(self, collection, filter_dict): - try: - self.collection = self.db[collection] - result = self.collection.delete_many(filter_dict) - except Exception as e: - raise e - else: - return result - - # counts the number of documents in collection matching the filter - def count_documents(self, collection, filter_dict): - try: - self.collection = self.db[collection] - result = self.collection.count_documents(filter_dict) - except Exception as e: - raise e - else: - return result - - def get_collection_names(self): - return self.db.collection_names() diff --git a/embedding/src/database/__init__.py b/embedding/src/database/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/embedding/src/feedback/FeedbackCommentRequest.py 
b/embedding/src/feedback/FeedbackCommentRequest.py deleted file mode 100644 index eb42de5c..00000000 --- a/embedding/src/feedback/FeedbackCommentRequest.py +++ /dev/null @@ -1,44 +0,0 @@ -import json -from logging import getLogger -from fastapi import APIRouter, Request -from src.errors import invalidJson, requireFeedbackWithTextBlock, requireExerciseId -from src.entities import FeedbackWithTextBlock, Feedback -from src.feedback.FeedbackConsistency import FeedbackConsistency - -logger = getLogger(name="FeedbackCommentRequest") -router = APIRouter() - -@router.post("/feedback_consistency") -async def feedback(request: Request): - logger.debug("-" * 80) - logger.info("Start processing Feedback Comment Request:") - - # Parse json - try: - doc = await request.json() - except Exception as e: - logger.error("Exception while parsing json: {}".format(str(e))) - raise invalidJson - - logger.info("Request: {}".format(doc)) - if "feedbackWithTextBlock" not in doc: - logger.error("{}".format(requireFeedbackWithTextBlock.detail)) - raise requireFeedbackWithTextBlock - - if "exerciseId" not in doc: - logger.error("{}".format(requireExerciseId.detail)) - raise requireExerciseId - - blocks: list[FeedbackWithTextBlock] = [] - - for fwt in doc['feedbackWithTextBlock']: - blocks.append(FeedbackWithTextBlock(fwt['textBlockId'], fwt['clusterId'], fwt['text'], Feedback(fwt['feedbackId'], fwt['feedbackText'], fwt['credits']))) - - __fc = FeedbackConsistency(doc['exerciseId']) - response = __fc.check_consistency(feedback_with_text_blocks=blocks) - logger.info("Response {}".format(response)) - __fc.store_feedback() - - logger.info("Completed Feedback Comment Embedding Request.") - logger.debug("-" * 80) - return response diff --git a/embedding/src/feedback/FeedbackCommentResource.py b/embedding/src/feedback/FeedbackCommentResource.py deleted file mode 100644 index 05d96ee6..00000000 --- a/embedding/src/feedback/FeedbackCommentResource.py +++ /dev/null @@ -1,124 +0,0 @@ -import numpy as np -import os -import json -import pickle -import requests -from logging import getLogger -from typing import List -from src.elmo import ELMo -from src.database.Connection import Connection -from src.entities import FeedbackWithTextBlock, Feedback, Sentence, ElmoVector - -# Get container variable for segmentation url -SEGMENTATION_URL = str(os.environ['SEGMENTATION_URL']) if "SEGMENTATION_URL" in os.environ else "http://segmentation:8000/segment" - - -class FeedbackCommentResource: - __logger = getLogger(__name__) - __collection = 'feedback_consistency' - - def __init__(self, exercise_id): - self.__elmo = ELMo() - self.__conn = Connection() - self.__collection = 'feedback_consistency_' + (str(exercise_id) if exercise_id != -1 else 'test') - - def __segment_feedback_comments(self, feedback_with_tb: list): - self.__logger.info("Segment Feedback Comments.") - feedback = [] - for f in feedback_with_tb: - feedback.append({"id": f.feedback.id, "text": f.feedback.text}) - - request = {"feedback": feedback} - return self.post(SEGMENTATION_URL, request) - - def __embed_sentences(self, sentence: List[Sentence]): - return self.__elmo.embed_sentences(sentence) - - def __create_feedback_document(self, feedback_with_tb: FeedbackWithTextBlock): - embeddings = [] - for embedding in feedback_with_tb.feedback.feedbackEmbeddings: - embeddings.append({'embedding': pickle.dumps(np.array(embedding).flatten().tolist())}) - - doc = {'_id': feedback_with_tb.id, - 'cluster_id': feedback_with_tb.cluster_id, - 'text': feedback_with_tb.text, - 'text_embedding': 
pickle.dumps(np.array(feedback_with_tb.text_embedding).flatten().tolist()), - 'feedback': {'feedback_id': feedback_with_tb.feedback.id, - 'feedback_text': feedback_with_tb.feedback.text, - 'feedback_score': feedback_with_tb.feedback.score, - 'feedback_text_blocks': embeddings}} - - return doc - - def __replace_insert_documents(self, documents: []): - self.__logger.info("Replace-Insert Feedback.") - for doc in documents: - __filter = {'_id': doc['_id']} - try: - result = self.__conn.replace_document(collection=self.__collection, filter_dict=__filter, - replacement_dict=doc, upsert=True) - except Exception as e: - self.__logger.error(e) - else: - self.__logger.info( - "Modified Count: {} Upserted id {}".format(result.modified_count, result.upserted_id)) - - def embed_feedback(self, feedback_with_tb: list): - self.__logger.info("Embed Feedback.") - segmented_feedback_comments = self.__segment_feedback_comments(feedback_with_tb) - - for fwt in feedback_with_tb: - blocks = (blocks for blocks in segmented_feedback_comments['textBlocks'] if fwt.feedback.id == blocks['id']) - sentences: List[Sentence] = list(map(lambda b: fwt.feedback.text[b['startIndex']:b['endIndex']], blocks)) - vectors: List[ElmoVector] = self.__embed_sentences(sentences) - for v in vectors: - fwt.add_feedback_embedding(v) - - return feedback_with_tb - - def embed_feedback_text_blocks(self, feedback_with_tb: list): - sentences: List[Sentence] = list(map(lambda b: b.text, feedback_with_tb)) - vectors: List[ElmoVector] = self.__embed_sentences(sentences) - for fwt, vector in zip(feedback_with_tb, vectors): - fwt.text_embedding = vector - return feedback_with_tb - - def store_feedback(self, feedback_with_tb: list): - self.__logger.info("Store Feedback.") - - docs = [] - for fwt in feedback_with_tb: - docs.append(self.__create_feedback_document(feedback_with_tb=fwt)) - - self.__replace_insert_documents(documents=docs) - - def get_feedback_in_same_cluster(self, cluster_id: str, feedback_id: str): - self.__logger.info("Get feedback with same cluster id.") - _filter = {'$and': [{'cluster_id': cluster_id}, {'feedback.feedback_id': {'$ne': feedback_id}}]} - try: - result = self.__conn.find_documents(collection=self.__collection, filter_dict=_filter) - except Exception as e: - self.__logger.error(e) - return None - else: - return result - - def set_feedback_consistency_results(self, collection, doc): - try: - result = self.__conn.insert_document(collection=collection, document=doc) - except Exception as e: - self.__logger.error(e) - return None - else: - return result - - def post(self, api_endpoint, data): - response = requests.post(url=api_endpoint, json=data) - - if not response: - self.__logger.error("POST failed on {}: Status Code: {} Response: {}".format(api_endpoint, - response.status_code, - response.content)) - return None - - return json.loads(response.json()) diff --git a/embedding/src/feedback/FeedbackConsistency.py b/embedding/src/feedback/FeedbackConsistency.py deleted file mode 100644 index d5aa37f1..00000000 --- a/embedding/src/feedback/FeedbackConsistency.py +++ /dev/null @@ -1,71 +0,0 @@ -import pickle -import numpy as np -from logging import getLogger -from sklearn.metrics import pairwise_distances, silhouette_score -from src.feedback.FeedbackCommentResource import FeedbackCommentResource - - -class FeedbackConsistency: - __logger = getLogger(__name__) - __feedback_with_text_blocks: list = None - - def __init__(self, exercise_id): - self.__feedback_comment_resource = FeedbackCommentResource(exercise_id) - 
self.__comment_threshold = 0.37 - self.__text_block_threshold = 0.21 - - def __get_inconsistency(self, score_diff: float, comment_distance: float, text_block_distance: float): - if text_block_distance < self.__text_block_threshold: - if score_diff: - return 'INCONSISTENT_SCORE' if comment_distance < self.__comment_threshold else 'INCONSISTENT_FEEDBACK' - else: - return 'INCONSISTENT_COMMENT' if comment_distance > self.__comment_threshold else None - else: - return None - - def __calculate_distance_with_silhouette_score(self, x: [], y: []): - if len(x) < 2 and len(y) < 2: - distance = pairwise_distances(X=x, Y=y, metric='cosine').flatten()[0] - else: - samples = np.concatenate((x, y)) - labels = np.concatenate([np.full((1, len(x)), 1).flatten(), np.full((1, len(y)), 2).flatten()]) - distance = silhouette_score(X=samples, labels=labels, metric='cosine') - return distance - - def __calculate_mean_distance(self, x: [], y: []): - distance = pairwise_distances(X=x, Y=y, metric='cosine') - return np.mean(np.mean(distance, axis=1)) - - def check_consistency(self, feedback_with_text_blocks): - self.__logger.info("Check Consistencies") - # Find embeddings for each feedback comment - self.__feedback_with_text_blocks = self.__feedback_comment_resource.embed_feedback( - feedback_with_tb=feedback_with_text_blocks) - # Find embeddings for each student text - self.__feedback_with_text_blocks = self.__feedback_comment_resource.embed_feedback_text_blocks( - feedback_with_tb=feedback_with_text_blocks) - doc = [] - # Compare each new assessment with the ones in the database - for fwt in self.__feedback_with_text_blocks: - feedback_vector_x = fwt.feedback.feedbackEmbeddings - student_text_vector_x = fwt.text_embedding.reshape(1, -1).tolist() - # Get the assessments which have same the same cluster id - cluster = self.__feedback_comment_resource.get_feedback_in_same_cluster(cluster_id=fwt.cluster_id, - feedback_id=fwt.feedback.id) - # Calculate distances between each feedback embeddings and text block embeddings(student answers) - for item in cluster: - feedback_vector_y = list(map(lambda embedding: pickle.loads(embedding['embedding']), - item['feedback']['feedback_text_blocks'])) - student_text_vector_y = np.array(pickle.loads(item['text_embedding'])).reshape(1, -1).tolist() - feedback_distance = self.__calculate_mean_distance(x=feedback_vector_x, y=feedback_vector_y) - text_block_distance = self.__calculate_mean_distance(x=student_text_vector_x, y=student_text_vector_y) - inconsistency = self.__get_inconsistency( - score_diff=abs(fwt.feedback.score - item['feedback']['feedback_score']), - comment_distance=feedback_distance, text_block_distance=text_block_distance) - if inconsistency: - doc.append({"firstFeedbackId": fwt.feedback.id, "secondFeedbackId": item['feedback']['feedback_id'], "type": inconsistency}) - - return {'feedbackInconsistencies': doc} - - def store_feedback(self): - self.__feedback_comment_resource.store_feedback(self.__feedback_with_text_blocks) diff --git a/embedding/src/feedback/__init__.py b/embedding/src/feedback/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/embedding/src/main.py b/embedding/src/main.py index def5b6fd..74c5dec7 100644 --- a/embedding/src/main.py +++ b/embedding/src/main.py @@ -3,7 +3,6 @@ from fastapi import FastAPI, Request, Response, BackgroundTasks from src.TimerHandler import TimerHandler from src import UploadingResource -from src.feedback import FeedbackCommentRequest logger = logging.getLogger() logger.setLevel(logging.DEBUG) @@ -19,7 
diff --git a/embedding/src/main.py b/embedding/src/main.py
index def5b6fd..74c5dec7 100644
--- a/embedding/src/main.py
+++ b/embedding/src/main.py
@@ -3,7 +3,6 @@ from fastapi import FastAPI, Request, Response, BackgroundTasks
 from src.TimerHandler import TimerHandler
 from src import UploadingResource
-from src.feedback import FeedbackCommentRequest
 
 logger = logging.getLogger()
 logger.setLevel(logging.DEBUG)
@@ -19,7 +18,6 @@ app = FastAPI()
 
 app.include_router(UploadingResource.router)
-app.include_router(FeedbackCommentRequest.router)
 
 
 @app.post("/trigger")
diff --git a/init-mongo.js b/init-mongo.js
deleted file mode 100644
index 2a22120f..00000000
--- a/init-mongo.js
+++ /dev/null
@@ -1,37 +0,0 @@
-db.createUser(
-    {
-        user: "embedding",
-        pwd: "embedding_password",
-        roles: [
-            {
-                role: "readWrite",
-                db: "athene_db"
-            }
-        ]
-    }
-);
-db.createUser(
-    {
-        user: "clustering",
-        pwd: "clustering_password",
-        roles: [
-            {
-                role: "readWrite",
-                db: "athene_db"
-            }
-        ]
-    }
-);
-
-db.createUser(
-    {
-        user: "tracking",
-        pwd: "tracking_password",
-        roles: [
-            {
-                role: "readWrite",
-                db: "athene_db"
-            }
-        ]
-    }
-);
diff --git a/tracking/Dockerfile b/tracking/Dockerfile
deleted file mode 100644
index 0cf2261f..00000000
--- a/tracking/Dockerfile
+++ /dev/null
@@ -1,9 +0,0 @@
-FROM tiangolo/uvicorn-gunicorn-fastapi:python3.10
-
-COPY ./tracking/requirements.txt requirements.txt
-RUN pip install -r requirements.txt
-
-COPY ./tracking/src/ src/
-
-EXPOSE 8000
-CMD uvicorn --host 0.0.0.0 --port 8000 src.main:app
diff --git a/tracking/Makefile b/tracking/Makefile
deleted file mode 100644
index b23f7b7b..00000000
--- a/tracking/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-#!make
-
-all: .venv
-
-.venv: requirements.txt
-	python -m venv .venv
-	source .venv/bin/activate; pip install -r requirements.txt
-
-start:
-	source .venv/bin/activate; python start.py
-
-clean:
-	rm -rf .venv
-
-.PHONY: all start clean
\ No newline at end of file
diff --git a/tracking/README.md b/tracking/README.md
deleted file mode 100644
index e19e3d72..00000000
--- a/tracking/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# Tracking Service
-
-## Start locally (without Docker)
-
-Locally, the service runs on port 8004. To start it,
-
-* first, run the following command for some preparations:
-  ```bash
-  make
-  ```
-  This will create a virtual environment and install all dependencies.
-
-* After that, activate the virtual environment:
-  ```bash
-  source .venv/bin/activate
-  ```
-  If you use an IDE, you can also configure the virtual environment there.
-  In PyCharm, you can even go to `File > Open`, choose the tracking folder
-  and then choose the `Attach` option.
-
-* Then, you can start the tracking server using `python start.py` or using your IDE.
-
-## Start with Docker
-
-Use the `docker-compose.yml` file from the parent directory
-to start the tracking service (and all others) with Docker.
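Deployments that still run their own MongoDB and relied on the dropped `init-mongo.js` can recreate the same users programmatically. A minimal sketch with pymongo, assuming an already-authenticated admin connection (the URI is a placeholder; user names, passwords, and roles mirror the deleted script):

```python
# Recreate the users from the removed init-mongo.js via pymongo.
# The connection URI is a placeholder and may need admin credentials;
# user names, passwords, and roles mirror the deleted script.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # placeholder URI
db = client["athene_db"]

for user, pwd in [("embedding", "embedding_password"),
                  ("clustering", "clustering_password"),
                  ("tracking", "tracking_password")]:
    # createUser is issued as a database command; readWrite on athene_db
    # matches the roles in the removed script.
    db.command("createUser", user, pwd=pwd,
               roles=[{"role": "readWrite", "db": "athene_db"}])
```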
diff --git a/tracking/requirements.txt b/tracking/requirements.txt
deleted file mode 100644
index e551adc1..00000000
--- a/tracking/requirements.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-fastapi==0.90.0
-jaro-winkler==2.0.3
-numpy==1.24.2
-pandas==1.5.3
-PyJWT==2.6.0
-pymongo==4.3.3
-scikit-learn==1.2.1
-scipy==1.10.0
-uvicorn==0.20.0
diff --git a/tracking/src/__init__.py b/tracking/src/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tracking/src/database/Connection.py b/tracking/src/database/Connection.py
deleted file mode 100644
index d0bf0dae..00000000
--- a/tracking/src/database/Connection.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import os
-import pymongo
-import pandas as pd
-
-
-# This class contains most of the important collection-level pymongo operations, but not all of them.
-# For the whole list and detailed explanations, see https://api.mongodb.com/python/current/api/pymongo/collection.html
-class Connection:
-
-    def __init__(self):
-        # Get container variables for the database connection
-        dbhost = str(os.environ['DATABASE_HOST']) if "DATABASE_HOST" in os.environ else "database"
-        dbport = int(os.environ['DATABASE_PORT']) if "DATABASE_PORT" in os.environ else 27017
-        dbname = str(os.environ['DATABASE_NAME']) if "DATABASE_NAME" in os.environ else "athene_db"
-        dbuser = str(os.environ['TRACKING_DATABASE_USER']) if "TRACKING_DATABASE_USER" in os.environ else "tracking"
-        dbpwd = str(os.environ['TRACKING_DATABASE_PWD']) if "TRACKING_DATABASE_PWD" in os.environ else "tracking_password"
-        self.client = pymongo.MongoClient(host=dbhost, port=dbport, username=dbuser, password=dbpwd,
-                                          authSource=dbname)
-        self.db = self.client[dbname]
-        self.collection = None
-
-    # Inserts one document into a collection
-    # collection {string} - collection name to store the document
-    # document {field-value pairs} - e.g. {'x': 1, 'y': "apples"}
-    def insert_document(self, collection: str, document: dict):
-        self.collection = self.db[collection]
-        self.collection.insert_one(document)
-
-    # Inserts an array of documents into a collection
-    # collection {string} - collection name to store the documents
-    # documents {array} - e.g. [{'x': 1, 'y': "apples"}, {'x': 15, 'y': "oranges", 'z': 40.5}]
-    def insert_documents(self, collection: str, documents: list):
-        try:
-            self.collection = self.db[collection]
-            self.collection.insert_many(documents)
-        except Exception as e:
-            print(e)
-
-    # Queries the database and returns the results
-    # filter_dict {field-value pairs} - specifies elements which must be present in the resulting set
-    # projection {field-value pairs} - field names to include or exclude in the resulting set, e.g. {'_id': False} excludes _id values
-    # skip {int} - number of documents to omit (from the start of the result set) when returning the results
-    # limit {int} - max number of results to return
-    # max_time_ms {int} - time limit for the query operation; if the specified time is exceeded, the operation is aborted
-    def find_documents(self, collection: str, filter_dict: dict, projection: dict = None, skip: int = 0, limit: int = 0,
-                       max_time_ms: int = None):
-        try:
-            self.collection = self.db[collection]
-            docs = self.collection.find(filter=filter_dict, projection=projection, skip=skip, limit=limit,
-                                        max_time_ms=max_time_ms)
-        except Exception as e:
-            print(e)
-        else:
-            return docs
-    # Updates one document matching the filter
-    # filter_dict {field-value pairs} - finds the document to update, e.g. {'x': 1}
-    # update_dict {field-value pairs} - modifications to apply, e.g. {'$set': {'x': 3}}
-    # upsert {boolean} - if true, performs an insert when no documents match the filter
-    # Note: For the full list of update parameters see https://docs.mongodb.com/manual/reference/operator/update/
-    def update_document(self, collection: str, filter_dict: dict, update_dict: dict, upsert: bool = False):
-        try:
-            self.collection = self.db[collection]
-            result = self.collection.update_one(filter_dict, update_dict, upsert)
-        except Exception as e:
-            print(e)
-        else:
-            return result
-
-    # Updates one or more documents matching the filter
-    def update_documents(self, collection: str, filter_dict: dict, update_dict: dict, upsert: bool = False):
-        try:
-            self.collection = self.db[collection]
-            result = self.collection.update_many(filter_dict, update_dict, upsert)
-        except Exception as e:
-            print(e)
-        else:
-            return result
-
-    # Deletes one document matching the filter
-    def delete_document(self, collection: str, filter_dict: dict):
-        try:
-            self.collection = self.db[collection]
-            result = self.collection.delete_one(filter_dict)
-        except Exception as e:
-            print(e)
-        else:
-            return result
-
-    # Deletes one or more documents matching the filter
-    def delete_documents(self, collection: str, filter_dict: dict):
-        try:
-            self.collection = self.db[collection]
-            result = self.collection.delete_many(filter_dict)
-        except Exception as e:
-            print(e)
-        else:
-            return result
-
-    # Counts the number of documents in the collection matching the filter
-    def count_documents(self, collection: str, filter_dict: dict):
-        try:
-            self.collection = self.db[collection]
-            result = self.collection.count_documents(filter_dict)
-        except Exception as e:
-            print(e)
-        else:
-            return result
-
-    def get_collection_names(self):
-        return self.db.collection_names()
-
-    def get_data_for_evaluation(self, exercise_id: int):
-        try:
-            self.collection = self.db.feedback
-            pipeline = [
-                {'$match': {"participation.exercise.id": exercise_id}},
-                {"$unwind": {'path': '$participation.results', 'preserveNullAndEmptyArrays': True}},
-                {'$unwind': {'path': '$participation.results.feedbacks', 'preserveNullAndEmptyArrays': True}},
-                {'$project': {
-                    'ID': '$ID',
-                    'pID': '$participation.id',
-                    'feedbacks': '$participation.results.feedbacks'
-                }},
-            ]
-
-            query_result = self.collection.aggregate(pipeline)
-            query_result = list(query_result)
-            df = pd.json_normalize(query_result)
-
-            if len(df.index) == 0:
-                print(f'Exercise {exercise_id} was not tracked!')
-                return
-
-            # sort feedback by text block reference
-            df = df.sort_values('feedbacks.reference')
-
-            # remove newline characters from feedbacks to prevent the csv from breaking
-            df = df.replace('\n', ' ', regex=True)
-
-            # write the dataframe to csv
-            pd.DataFrame.to_csv(df, './similarity.csv', ';')
-
-            return df
-        except Exception as e:
-            print(e)
diff --git a/tracking/src/database/__init__.py b/tracking/src/database/__init__.py
deleted file mode 100644
index e69de29b..00000000
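At its core, the removed `Connection` wrapper is an environment-driven `MongoClient`. A minimal sketch of the equivalent setup, using the env variable names and fallback values from the deleted code (the function name is illustrative):

```python
# Environment-driven MongoDB connection, as the removed Connection.__init__
# built it; variable names and defaults come from the deleted code.
import os

import pymongo


def get_database():
    host = os.environ.get("DATABASE_HOST", "database")
    port = int(os.environ.get("DATABASE_PORT", 27017))
    name = os.environ.get("DATABASE_NAME", "athene_db")
    user = os.environ.get("TRACKING_DATABASE_USER", "tracking")
    pwd = os.environ.get("TRACKING_DATABASE_PWD", "tracking_password")
    client = pymongo.MongoClient(host=host, port=port, username=user,
                                 password=pwd, authSource=name)
    return client[name]
```

With such a handle, the wrapper's methods collapse to one-liners, e.g. `get_database()["feedback"].insert_one(doc)`.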
diff --git a/tracking/src/main.py b/tracking/src/main.py
deleted file mode 100644
index 9d7de4a7..00000000
--- a/tracking/src/main.py
+++ /dev/null
@@ -1,251 +0,0 @@
-import base64
-from math import sqrt
-from os import environ
-
-import jaro
-import numpy as np
-import pandas as pd
-from enum import Enum
-from fastapi import FastAPI, Request, Response, status
-from jwt import decode
-from sklearn import preprocessing
-from sklearn.metrics import cohen_kappa_score
-
-from .database.Connection import Connection
-
-app = FastAPI()
-
-
-@app.post('/tracking/text-exercise-assessment', status_code=201)
-async def track(request: Request, response: Response):
-    feedback = await request.json()
-    jwt_token = request.headers.get('x-athene-tracking-authorization')
-    secret_base64 = environ['AUTHORIZATION_SECRET']
-    try:
-        encoded_jwt_token = decode(jwt_token, base64.b64decode(secret_base64), verify=True, algorithms=['HS256'])
-        if encoded_jwt_token.get('result_id') != feedback.get('participation').get('results')[0].get('id'):
-            response.status_code = status.HTTP_403_FORBIDDEN
-            return {'Please do not spam manually!'}
-    except Exception as e:
-        print(e)
-        response.status_code = status.HTTP_401_UNAUTHORIZED
-        return {'Your token is not valid!'}
-    try:
-        conn = Connection()
-        conn.insert_document('feedback', feedback)
-    except Exception as e:
-        print(e)
-        response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR
-        return {'message': 'Saving in the database did not work!'}
-    return {'Feedback is tracked successfully'}
-
-
-@app.get('/tracking/exerciseId/{exercise_id}', status_code=200)
-async def evaluate(exercise_id: int, response: Response):
-    try:
-        conn = Connection()
-        raw_data = conn.get_data_for_evaluation(exercise_id)
-        metrics = calculate_metrics(raw_data, exercise_id)
-        return metrics
-    except Exception as e:
-        print(e)
-        response.status_code = status.HTTP_404_NOT_FOUND
-        return {'message': 'There is no data for this exercise!'}
-
-
-class FeedbackType(Enum):
-    Automatic = 1
-    Typo = 2
-    Extended = 3
-    Different = 4
-
-
-def cohens_kappa(l1, l2):
-    # transform float values into distinct categories
-    enc = preprocessing.LabelEncoder()
-    enc.fit(np.hstack((l1, l2)))
-
-    # calculate kappa
-    kappa_val = cohen_kappa_score(enc.transform(l1), enc.transform(l2))
-    return kappa_val
-
-
-def jaro_winkler(s1: str, s2: str):
-    dis = jaro.jaro_winkler_metric(s1, s2)
-    return dis
-
-
-def jaro_metric(s1: str, s2: str):
-    dis = jaro.jaro_metric(s1, s2)
-    return dis
-
-
-# Calculates the normalized Levenshtein similarity of 2 strings
-def levenshtein(s1, s2):
-    l1 = len(s1)
-    l2 = len(s2)
-    matrix = [list(range(l1 + 1))] * (l2 + 1)
-    for zz in list(range(l2 + 1)):
-        matrix[zz] = list(range(zz, zz + l1 + 1))
-    for zz in list(range(0, l2)):
-        for sz in list(range(0, l1)):
-            if s1[sz] == s2[zz]:
-                matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1, matrix[zz][sz + 1] + 1, matrix[zz][sz])
-            else:
-                matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1, matrix[zz][sz + 1] + 1, matrix[zz][sz] + 1)
-    distance = float(matrix[l2][l1])
-    result = 1.0 - distance / max(l1, l2)
-    return result
-
-
-# Dynamic programming implementation of the LCS problem
-def lcs(s1, s2):
-    # find the length of the strings
-    m = len(s1)
-    n = len(s2)
-
-    # declaring the array for storing the dp values
-    L = [[None] * (n + 1) for i in range(m + 1)]
-
-    """The following steps build L[m + 1][n + 1] in bottom-up fashion.
-    Note: L[i][j] contains the length of the LCS of s1[0..i-1]
-    and s2[0..j-1]"""
-    for i in range(m + 1):
-        for j in range(n + 1):
-            if i == 0 or j == 0:
-                L[i][j] = 0
-            elif s1[i - 1] == s2[j - 1]:
-                L[i][j] = L[i - 1][j - 1] + 1
-            else:
-                L[i][j] = max(L[i - 1][j], L[i][j - 1])
-
-    # L[m][n] contains the length of the LCS of s1[0..m-1] & s2[0..n-1];
-    # normalize it by the length of the shorter string
-    # prevent division by zero for empty strings
-    if min(len(s1), len(s2)) > 0:
-        return L[m][n] / min(len(s1), len(s2))
-    else:
-        return 0
-
-
-def st_mean_diff(l1, l2):
-    mean_l1 = np.mean(l1)
-    mean_l2 = np.mean(l2)
-    std_l1 = np.std(l1)
-    std_l2 = np.std(l2)
-
-    diff = abs((mean_l1 - mean_l2) / sqrt((std_l1 + std_l2) / 2))
-
-    return diff
-
-
-def calculate_duration(start, end):
-    # calculate the assessment time from the ObjectId timestamps
-    start_time = start.generation_time
-    end_time = end.generation_time
-
-    timedelta = end_time - start_time
-
-    return timedelta.total_seconds()
-
-
-def classify_comment(s1: str, s2: str):
-    if s1 == s2:
-        return FeedbackType.Automatic
-    elif levenshtein(s1, s2) > 0.9:
-        return FeedbackType.Typo
-    elif lcs(s1, s2) > 0.95 and jaro_winkler(s1, s2) > 0.6:
-        return FeedbackType.Extended
-    else:
-        return FeedbackType.Different
-
-
-def calculate_metrics(df: pd.DataFrame, exercise_id: int):
-    score_first_feedbacks = []
-    score_last_feedbacks = []
-    automatic_assessment_times = []
-    manual_assessment_times = []
-    assessed_participations = []
-
-    type_count = {
-        str(FeedbackType.Automatic): 0,
-        str(FeedbackType.Typo): 0,
-        str(FeedbackType.Extended): 0,
-        str(FeedbackType.Different): 0
-    }
-    count = 0
-    log_count = 0
-
-    for reference, df_reference in df.groupby('feedbacks.reference'):
-        # sort df since the earlier sorting changed the order
-        df_reference = df_reference.sort_index()
-        count += 1
-        # make sure that automatic feedback was provided
-        if len(df_reference.index) > 1:
-            head = df_reference.head(1)
-            tail = df_reference.tail(1)
-            if head['feedbacks.type'].values[0] == 'AUTOMATIC':
-                # only the first and the last entry are important
-                score_first_feedbacks.append(head['feedbacks.credits'].values[0])
-                score_last_feedbacks.append(tail['feedbacks.credits'].values[0])
-
-                # classify the feedback comment
-                automatic_comment = head['feedbacks.detailText'].values[0]
-                human_comment = tail['feedbacks.detailText'].values[0]
-
-                # only calculate the duration once for each participation
-                participation_id = head['pID'].values[0]
-                if participation_id not in assessed_participations:
-                    # calculate duration
-                    duration = calculate_duration(head['_id'].values[0], tail['_id'].values[0])
-                    automatic_assessment_times.append(duration)
-                    assessed_participations.append(participation_id)
-
-                feedback_type = classify_comment(str(automatic_comment), str(human_comment))
-                type_count[str(feedback_type)] += 1
-            else:
-                # only calculate the duration once for each participation
-                participation_id = head['pID'].values[0]
-                if participation_id not in assessed_participations:
-                    duration = calculate_duration(head['_id'].values[0], tail['_id'].values[0])
-                    manual_assessment_times.append(duration)
-                    assessed_participations.append(participation_id)
-        else:
-            if df_reference.head(1)['feedbacks.type'].values[0] == 'MANUAL':
-                log_count += 1
-
-    percentage_provided = round(len(score_first_feedbacks) / count * 100, 2)
-
-    kappa_val = cohens_kappa(score_first_feedbacks, score_last_feedbacks)
-
-    diff = st_mean_diff(score_first_feedbacks, score_last_feedbacks)
-
-    metrics = {
-        'exerciseId': exercise_id,
-        'sample_size_total': count,
-        'sample_size_metrics': len(score_first_feedbacks),
-        'percentage_provided': percentage_provided,
-        'cohens_kappa': round(kappa_val, 4),
-        'std_mean_score_diff': round(diff, 4),
-        'comment_distribution': type_count,
-        'percentage_automatic_feedback': round(type_count[str(FeedbackType.Automatic)] / len(score_first_feedbacks), 4),
-        'automatic_assessment_duration': {
-            'min_seconds': round(np.min(automatic_assessment_times), 2),
-            'max_seconds': round(np.max(automatic_assessment_times), 2),
-            'mean_seconds': round(np.mean(automatic_assessment_times), 2),
-            'median_seconds': round(np.median(automatic_assessment_times), 2)},
-        'manual_assessment_duration': {
-            'min_seconds': round(np.min(manual_assessment_times), 2),
-            'max_seconds': round(np.max(manual_assessment_times), 2),
-            'mean_seconds': round(np.mean(manual_assessment_times), 2),
-            'median_seconds': round(np.median(manual_assessment_times), 2)}
-    }
-
-    print(metrics)
-    return metrics
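The authorization logic of the removed `/tracking/text-exercise-assessment` endpoint distills into a small helper. A sketch under the same assumptions as the deleted code (HS256, a base64-encoded `AUTHORIZATION_SECRET`, and a `result_id` claim; the helper name is illustrative):

```python
# Distilled authorization check from the removed tracking endpoint: the
# header token is an HS256 JWT signed with the base64-decoded
# AUTHORIZATION_SECRET, and its result_id claim must match the tracked
# result. The helper name is illustrative.
import base64
from os import environ

from jwt import InvalidTokenError, decode  # PyJWT


def is_authorized(jwt_token: str, result_id: int) -> bool:
    secret = base64.b64decode(environ['AUTHORIZATION_SECRET'])
    try:
        claims = decode(jwt_token, secret, algorithms=['HS256'])
    except InvalidTokenError:
        return False
    return claims.get('result_id') == result_id
```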
diff --git a/tracking/start.py b/tracking/start.py
deleted file mode 100644
index 242a3e3a..00000000
--- a/tracking/start.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import uvicorn
-
-
-def start():
-    uvicorn.run("src.main:app", host="127.0.0.1", port=8004, reload=True, reload_dirs=["src"])
-
-
-if __name__ == "__main__":
-    start()
diff --git a/traefik/traefik-dynamic.local.yml b/traefik/traefik-dynamic.local.yml
index f8375cda..5216b711 100644
--- a/traefik/traefik-dynamic.local.yml
+++ b/traefik/traefik-dynamic.local.yml
@@ -23,12 +23,6 @@ http:
       entryPoints:
         - web
       service: embedding
-    # http://localhost/tracking
-    tracking:
-      rule: "Path(`/tracking`)"
-      entryPoints:
-        - web
-      service: tracking
   services:
     test:
       loadBalancer:
@@ -49,7 +43,3 @@ http:
       loadBalancer:
         servers:
           - url: "http://host.docker.internal:8003"
-    tracking:
-      loadBalancer:
-        servers:
-          - url: "http://host.docker.internal:8004"
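For context, the central agreement metric computed by the removed evaluation endpoint is Cohen's kappa between the first (automatic) and last (manual) credit values, with a `LabelEncoder` turning the float scores into categories. A self-contained illustration, using hypothetical sample scores:

```python
# Cohen's kappa between automatic and final scores, as in the removed
# calculate_metrics; the score lists here are hypothetical sample data.
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import cohen_kappa_score

first_scores = [1.0, 2.0, 2.0, 0.5, 1.5]  # hypothetical automatic credits
last_scores = [1.0, 2.0, 1.5, 0.5, 1.5]   # hypothetical final credits

# encode the float scores as categorical labels, then compute kappa
enc = preprocessing.LabelEncoder()
enc.fit(np.hstack((first_scores, last_scores)))
kappa = cohen_kappa_score(enc.transform(first_scores), enc.transform(last_scores))
print(round(kappa, 4))
```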