From dcad4a97f68c2644536d40596fe65837e5c6b6ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20R=C3=B8ysland=20Aarnes?= Date: Sun, 21 Apr 2024 20:27:40 +0200 Subject: [PATCH 01/16] Abstract model class. Added conftest string so commit wouldn't bug out --- checkthat/task1/models/transformer_model.py | 22 ++++++++++ conftest.py | 1 + tests/test_models/test_transformer_model.py | 47 +++++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 checkthat/task1/models/transformer_model.py create mode 100644 tests/test_models/test_transformer_model.py diff --git a/checkthat/task1/models/transformer_model.py b/checkthat/task1/models/transformer_model.py new file mode 100644 index 0000000..d82c05d --- /dev/null +++ b/checkthat/task1/models/transformer_model.py @@ -0,0 +1,22 @@ +"""Module contains the transformer model for Task 1. + +Abstract class for later use. +""" + +from abc import ABC, abstractmethod +import torch.nn as nn + + +class Model(ABC, nn.Module): + def __init__(self): + """Constructor for the Model class.""" + super(Model, self).__init__() + + @abstractmethod + def forward(self, x): + """Forward pass of the model. + + Args: + x: Input tensor. + """ + pass diff --git a/conftest.py b/conftest.py index e69de29..5405deb 100644 --- a/conftest.py +++ b/conftest.py @@ -0,0 +1 @@ +"""Module for pytest configuration and fixtures.""" diff --git a/tests/test_models/test_transformer_model.py b/tests/test_models/test_transformer_model.py new file mode 100644 index 0000000..4381d36 --- /dev/null +++ b/tests/test_models/test_transformer_model.py @@ -0,0 +1,47 @@ +"""Tests for the transformer model.""" + +import pytest +import torch +from checkthat.task1.models.transformer_model import Model + + +class ConcreteModel(Model): + """Concrete model for testing purposes.""" + + def forward(self, x): + """Forward pass of the model. + + This method takes an input tensor and returns an output tensor where + each element is doubled. + + Args: + x (torch.Tensor): Input tensor. + + Returns: + torch.Tensor: Output tensor, where each element is doubled. + """ + return x * 2 + + +def test_model_cannot_be_instantiated(): + """Test if the Model class cannot be instantiated. + + This test checks whether attempting to instantiate the abstract + Model class raises a TypeError, as expected for abstract classes. + """ + with pytest.raises(TypeError): + Model() # Directly test instantiation without assignment + + +def test_concrete_model(): + """Test if the ConcreteModel class works correctly. + + This test verifies that the ConcreteModel class's forward method + processes input tensors correctly by doubling each element. + """ + x = torch.tensor([1.0, 2.0, 3.0]) + model = ConcreteModel() + expected_output = torch.tensor([2.0, 4.0, 6.0]) + assert torch.equal( + model(x), expected_output + ), "The output tensor does not match the expected doubled values." From 12172f28258cb6293394ecdc7542cbb24ad66b69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20R=C3=B8ysland=20Aarnes?= Date: Wed, 1 May 2024 16:46:30 +0200 Subject: [PATCH 02/16] Added files for training and most tests. Made changes to requirements.txt. 
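For reference, the training entry point added in this patch (checkthat/task1/main.py) wires the new pieces together. A minimal usage sketch, assuming the listed requirements are installed and the wandb project configured in training.py is reachable; it only mirrors the defaults that appear in the diff below and adds no new behaviour:

    from datasets import load_dataset
    from transformers import AutoTokenizer
    from training_scripts.training import run_training

    # Defaults taken from main.py in this patch (English subtask).
    model_name = "FacebookAI/roberta-large"
    dataset = load_dataset("iai-group/clef2024_checkthat_task1_en")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    label_map = {"No": 0, "Yes": 1}

    # Seeds used for reproducibility, as in main.py.
    for seed in [42, 81, 1024, 6, 10]:
        run_training(seed, dataset, model_name, tokenizer, label_map)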
--- .gitignore | 3 + checkthat/task1/__init__.py | 1 + checkthat/task1/main.py | 48 +++++++++++ checkthat/task1/main_train_all.py | 37 ++++++++ checkthat/task1/metrics/__init__.py | 8 ++ checkthat/task1/metrics/compute_metrics.py | 38 +++++++++ checkthat/task1/metrics/metrics_logger.py | 61 ++++++++++++++ checkthat/task1/models/__init__.py | 3 + ...transformer_model.py => abstract_model.py} | 17 ++-- checkthat/task1/models/custom_model.py | 29 +++++++ checkthat/task1/tokenization/__init__.py | 4 + .../normalize_DatasetDict_featues.py | 22 +++++ checkthat/task1/tokenization/tokenizer.py | 41 +++++++++ checkthat/task1/training_config.yaml | 18 ++++ checkthat/task1/training_scripts/__init__.py | 4 + .../task1/training_scripts/train_config.py | 19 +++++ checkthat/task1/training_scripts/training.py | 72 ++++++++++++++++ requirements.txt | 76 +++++++++++++++++ tests/__init__.py | 2 +- tests/metrics/test_compute_metrics.py | 50 +++++++++++ tests/metrics/test_metrics_logger.py | 64 ++++++++++++++ tests/models/__init__.py | 0 .../test_abstract_model.py} | 12 +-- tests/models/test_custom_model.py | 41 +++++++++ .../test_normalize_DatasetDict_features.py | 84 +++++++++++++++++++ tests/tokenization/test_tokenizer.py | 34 ++++++++ tests/training_scripts/test_train_config.py | 23 +++++ tests/training_scripts/test_training.py | 34 ++++++++ 28 files changed, 824 insertions(+), 21 deletions(-) create mode 100644 checkthat/task1/__init__.py create mode 100644 checkthat/task1/main.py create mode 100644 checkthat/task1/main_train_all.py create mode 100644 checkthat/task1/metrics/__init__.py create mode 100644 checkthat/task1/metrics/compute_metrics.py create mode 100644 checkthat/task1/metrics/metrics_logger.py create mode 100644 checkthat/task1/models/__init__.py rename checkthat/task1/models/{transformer_model.py => abstract_model.py} (52%) create mode 100644 checkthat/task1/models/custom_model.py create mode 100644 checkthat/task1/tokenization/__init__.py create mode 100644 checkthat/task1/tokenization/normalize_DatasetDict_featues.py create mode 100644 checkthat/task1/tokenization/tokenizer.py create mode 100644 checkthat/task1/training_config.yaml create mode 100644 checkthat/task1/training_scripts/__init__.py create mode 100644 checkthat/task1/training_scripts/train_config.py create mode 100644 checkthat/task1/training_scripts/training.py create mode 100644 tests/metrics/test_compute_metrics.py create mode 100644 tests/metrics/test_metrics_logger.py create mode 100644 tests/models/__init__.py rename tests/{test_models/test_transformer_model.py => models/test_abstract_model.py} (76%) create mode 100644 tests/models/test_custom_model.py create mode 100644 tests/tokenization/test_normalize_DatasetDict_features.py create mode 100644 tests/tokenization/test_tokenizer.py create mode 100644 tests/training_scripts/test_train_config.py create mode 100644 tests/training_scripts/test_training.py diff --git a/.gitignore b/.gitignore index b6e4761..4059efb 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ + +# Exclude .DS_Store files everywhere +*.DS_Store \ No newline at end of file diff --git a/checkthat/task1/__init__.py b/checkthat/task1/__init__.py new file mode 100644 index 0000000..a1ef5bd --- /dev/null +++ b/checkthat/task1/__init__.py @@ -0,0 +1 @@ +"""init file for main module.""" diff --git a/checkthat/task1/main.py b/checkthat/task1/main.py new file mode 100644 index 0000000..cc7ff58 --- /dev/null +++ b/checkthat/task1/main.py @@ -0,0 +1,48 @@ +"""Will run 
script to run training and testing. (test yet to be implemented) + +Argument parser is used to specify the model name and dataset name. +""" +import argparse +from datasets import load_dataset +from training_scripts.training import run_training +from transformers import AutoTokenizer + + +def main(args): + """Run training.""" + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + + dataset = load_dataset(args.dataset) + label_map = {"No": 0, "Yes": 1} # Label map for the dataset + + seeds = [42, 81, 1024, 6, 10] # Seeds for reproducibility + if args.train: + for seed in seeds: + run_training(seed, dataset, args.model_name, tokenizer, label_map) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="Run training and testing.") + + parser.add_argument( + "--train", action="store_true", help="Whether to run training" + ) + parser.add_argument( + "--test", action="store_true", help="Whether to run testing" + ) + parser.add_argument( + "--model_name", + type=str, + default="FacebookAI/roberta-large", # For English language + help="Name of the model", + ) + parser.add_argument( + "--dataset", + type=str, + default="iai-group/clef2024_checkthat_task1_en", # For English language + help="Name of the dataset", + ) + + args = parser.parse_args() + main(args) diff --git a/checkthat/task1/main_train_all.py b/checkthat/task1/main_train_all.py new file mode 100644 index 0000000..2725a2f --- /dev/null +++ b/checkthat/task1/main_train_all.py @@ -0,0 +1,37 @@ +"""Main training script for training on all languages.""" +from datasets import load_dataset +from tokenization.normalize_DatasetDict_featues import rename_features +from transformers import AutoTokenizer +from training_scripts.training import run_training + + +def main(): + en, ar, es, nl = ( + "iai-group/clef2024_checkthat_task1_en", + "iai-group/clef2024_checkthat_task1_ar", + "iai-group/clef2024_checkthat_task1_es", + "iai-group/clef2024_checkthat_task1_nl", + ) + + dataset_list = [en, ar, es, nl] + label_map = {"No": 0, "Yes": 1} # Label map for the dataset + + model_name_en = "FacebookAI/roberta-large" + multilingual_model = "FacebookAI/xlm-roberta-large" + + seeds = [42, 81, 1024, 6, 10] # Seeds for reproducibility + + tokenizer = AutoTokenizer.from_pretrained(model_name_en) + + for seed, dataset in zip(seeds, dataset_list): + dataset = load_dataset(dataset) + # Normalize dataset features if not already normalized (intended for twitter dataset) + if dataset["train"]["tweet_text"]: + dataset = rename_features(dataset) + tokenizer = AutoTokenizer.from_pretrained(multilingual_model) + + run_training(seed, dataset, model_name_en, tokenizer, label_map) + + +if __name__ == "__main__": + main() diff --git a/checkthat/task1/metrics/__init__.py b/checkthat/task1/metrics/__init__.py new file mode 100644 index 0000000..f910853 --- /dev/null +++ b/checkthat/task1/metrics/__init__.py @@ -0,0 +1,8 @@ +from .compute_metrics import ( + compute_metrics, + accuracy_metric, + precision_metric, + recall_metric, + f1_metric, +) +from .metrics_logger import MetricsLoggerCallback, compute_custom_metrics diff --git a/checkthat/task1/metrics/compute_metrics.py b/checkthat/task1/metrics/compute_metrics.py new file mode 100644 index 0000000..2ec982b --- /dev/null +++ b/checkthat/task1/metrics/compute_metrics.py @@ -0,0 +1,38 @@ +"""Function to compute four metrics: accuracy, precision, recall, and F1-score. + +Metrics will be passed to wandb for logging. 
+""" +from evaluate import load + +"""Compute accuracy, precision, recall, and F1-score metrics.""" +accuracy_metric = load("accuracy") +precision_metric = load("precision") +recall_metric = load("recall") +f1_metric = load("f1") + + +def compute_metrics(eval_pred): + """Compute accuracy, precision, recall, and F1-score metrics. + + Args: + eval_pred: Tuple of logits and labels. + + Returns: + dict: Dictionary containing the computed metrics. + """ + logits, labels = eval_pred + predictions = logits.argmax(-1) + return { + "accuracy": accuracy_metric.compute( + predictions=predictions, references=labels + )["accuracy"], + "precision": precision_metric.compute( + predictions=predictions, references=labels, average="weighted" + )["precision"], + "recall": recall_metric.compute( + predictions=predictions, references=labels, average="weighted" + )["recall"], + "f1": f1_metric.compute( + predictions=predictions, references=labels, average="weighted" + )["f1"], + } diff --git a/checkthat/task1/metrics/metrics_logger.py b/checkthat/task1/metrics/metrics_logger.py new file mode 100644 index 0000000..fedcd90 --- /dev/null +++ b/checkthat/task1/metrics/metrics_logger.py @@ -0,0 +1,61 @@ +"""Sets up the logging for the metrics using Weights and Biases.""" +import wandb +import numpy as np +from sklearn.metrics import precision_score, recall_score, f1_score +from transformers import TrainerCallback + + +def compute_custom_metrics(logits, labels): + """Compute precision, recall, and F1-score from model logits and true + labels. + + Args: + logits (np.array): Logits returned by the model. Shape (num_samples, num_classes). + labels (np.array): True labels. Shape (num_samples,). + + Returns: + tuple: precision, recall, F1-score + """ + + predictions = np.argmax(logits, axis=1) # Convert logits to predictions + + # Calculate metrics + precision = precision_score(labels, predictions, average="binary") + recall = recall_score(labels, predictions, average="binary") + f1 = f1_score(labels, predictions, average="binary") + + return precision, recall, f1 + + +class MetricsLoggerCallback(TrainerCallback): + """Custom callback for logging additional metrics to wandb.""" + + def on_evaluate(self, args, state, **kwargs): + # Assuming 'logits' and 'labels' are part of the outputs collected during evaluation + logits = kwargs["logits"] + labels = kwargs["labels"] + + # Compute custom metrics + precision, recall, f1 = compute_custom_metrics(logits, labels) + + # Log custom metrics to wandb + wandb.log( + { + "precision": precision, + "recall": recall, + "f1_score": f1, + "epoch": state.epoch, + } + ) + + +callback_map = { + "MetricsLoggerCallback": MetricsLoggerCallback, +} + + +def get_callbacks(callback_names): + """Create a list of callback instances from a list of callback names.""" + return [ + callback_map[name]() for name in callback_names if name in callback_map + ] diff --git a/checkthat/task1/models/__init__.py b/checkthat/task1/models/__init__.py new file mode 100644 index 0000000..a0e880a --- /dev/null +++ b/checkthat/task1/models/__init__.py @@ -0,0 +1,3 @@ +"""init file for models module.""" +from .custom_model import CustomModel +from .abstract_model import Model diff --git a/checkthat/task1/models/transformer_model.py b/checkthat/task1/models/abstract_model.py similarity index 52% rename from checkthat/task1/models/transformer_model.py rename to checkthat/task1/models/abstract_model.py index d82c05d..290fd34 100644 --- a/checkthat/task1/models/transformer_model.py +++ 
b/checkthat/task1/models/abstract_model.py @@ -1,22 +1,19 @@ """Module contains the transformer model for Task 1. -Abstract class for later use. +Abstract indended as blueprint custom masked language model class. """ - from abc import ABC, abstractmethod +import torch import torch.nn as nn class Model(ABC, nn.Module): - def __init__(self): + def __init__(self) -> None: """Constructor for the Model class.""" super(Model, self).__init__() + return None @abstractmethod - def forward(self, x): - """Forward pass of the model. - - Args: - x: Input tensor. - """ - pass + def forward(self, x) -> torch.Tensor: + """Forward pass of the model.""" + return x diff --git a/checkthat/task1/models/custom_model.py b/checkthat/task1/models/custom_model.py new file mode 100644 index 0000000..cb5ed31 --- /dev/null +++ b/checkthat/task1/models/custom_model.py @@ -0,0 +1,29 @@ +"""Custom model for sequence classification tasks dervied from abstract class +model.py.""" +from .abstract_model import Model +from transformers import AutoModelForSequenceClassification + + +class CustomModel(Model): + def __init__(self, model_name: str, num_labels: int): + """Constructor for the CustomModel class. + + Args: + model_name (str): Accepts huggingface model name + num_labels (int): Number of labels in the dataset + """ + super(CustomModel, self).__init__() + self.model = AutoModelForSequenceClassification.from_pretrained( + model_name, num_labels=num_labels + ) + + def forward(self, input_ids, attention_mask=None, labels=None): + """Forward pass of the model. + + Including labels in the forward pass so the model can calculate + loss. + """ + output = self.model( + input_ids=input_ids, attention_mask=attention_mask, labels=labels + ) + return output diff --git a/checkthat/task1/tokenization/__init__.py b/checkthat/task1/tokenization/__init__.py new file mode 100644 index 0000000..089eb1f --- /dev/null +++ b/checkthat/task1/tokenization/__init__.py @@ -0,0 +1,4 @@ +"""init file for the tokenization module.""" + +from .tokenizer import TextDataset +from .normalize_DatasetDict_featues import rename_features diff --git a/checkthat/task1/tokenization/normalize_DatasetDict_featues.py b/checkthat/task1/tokenization/normalize_DatasetDict_featues.py new file mode 100644 index 0000000..a953dd6 --- /dev/null +++ b/checkthat/task1/tokenization/normalize_DatasetDict_featues.py @@ -0,0 +1,22 @@ +"""For the datasets that do not follow the english dataset format, we need to +rename the features to match the english dataset format.""" + + +def rename_features(data) -> dict: + """Hacky function intended to use for twitter data to it uses same features + as other english dataset.""" + # Iterate over each split (train, validation, test) + feature_name_mapping = { + "tweet_text": "Text", + } + for split_name in data.keys(): + # Get the dataset for the current split + split_dataset = data[split_name] + + # Rename each feature in the dataset using the mapping + for old_name, new_name in feature_name_mapping.items(): + split_dataset = split_dataset.rename_column(old_name, new_name) + + # Update the dataset in the DatasetDict + data[split_name] = split_dataset + return data diff --git a/checkthat/task1/tokenization/tokenizer.py b/checkthat/task1/tokenization/tokenizer.py new file mode 100644 index 0000000..b4e5848 --- /dev/null +++ b/checkthat/task1/tokenization/tokenizer.py @@ -0,0 +1,41 @@ +"""Tokenizer for the task1 datasets.""" +import torch +from torch.utils.data import Dataset + + +class TextDataset(Dataset): + """Takes a list of dictionaries 
containing text and class labels. + + Args: + Dataset: Dataset class from torch.utils.data + """ + + def __init__(self, data, tokenizer, label_map): + """Initialize the TextDataset class.""" + self.data = data + self.tokenizer = tokenizer + self.label_map = label_map + + def __len__(self): + """Return the length of the dataset.""" + return len(self.data) + + def __getitem__(self, idx): + """Tokenize the text and return a dictionary containing the + tokenized.""" + item = self.data[idx] + encoded = self.tokenizer.encode_plus( + item["Text"], + add_special_tokens=True, + truncation=True, + padding="max_length", + return_attention_mask=True, + return_tensors="pt", + ) + + label_id = self.label_map[item["class_label"]] + return { + "input_ids": encoded["input_ids"].squeeze(0), + "attention_mask": encoded["attention_mask"].squeeze(0), + "labels": torch.tensor(label_id), + } diff --git a/checkthat/task1/training_config.yaml b/checkthat/task1/training_config.yaml new file mode 100644 index 0000000..a9b9f41 --- /dev/null +++ b/checkthat/task1/training_config.yaml @@ -0,0 +1,18 @@ +training_arguments: + evaluation_strategy: 'IntervalStrategy.epoch' # To change evaluation strategy comment out the line and uncomment the next line two lines + # eval_steps: 500 + # evaluation_strategy: 'steps' # evaluate after some number of steps + output_dir: './results' # output directory + save_total_limit: 5 # number of maximum checkpoints to save + num_train_epochs: 2 # number of training epochs + per_device_train_batch_size: 32 # batch size for training + per_device_eval_batch_size: 32 # batch size for evaluation + warmup_steps: 500 # number of warmup steps for learning rate scheduler + weight_decay: 0.01 # strength of weight decay + logging_dir: './logs' # directory for storing logs + logging_steps: 10 + load_best_model_at_end: True + metric_for_best_model: 'loss' # metric to use for saving best model + report_to: 'wandb' # report to wandb + + diff --git a/checkthat/task1/training_scripts/__init__.py b/checkthat/task1/training_scripts/__init__.py new file mode 100644 index 0000000..422193e --- /dev/null +++ b/checkthat/task1/training_scripts/__init__.py @@ -0,0 +1,4 @@ +"""init file for training module.""" + +from .training import run_training +from .train_config import get_training_arguments diff --git a/checkthat/task1/training_scripts/train_config.py b/checkthat/task1/training_scripts/train_config.py new file mode 100644 index 0000000..96c8651 --- /dev/null +++ b/checkthat/task1/training_scripts/train_config.py @@ -0,0 +1,19 @@ +"""Module to load training arguments from a yaml file.""" + +import yaml +from transformers import TrainingArguments + + +def load_config(file_path): + """Load configuration from a yaml file.""" + with open(file_path, "r") as file: + config = yaml.safe_load(file) + return config + + +def get_training_arguments(): + """Unpack training arguments from the config file and return as a + TrainingArguments object.""" + config = load_config("training_config.yaml") + training_args = config["training_arguments"] + return TrainingArguments(**training_args) diff --git a/checkthat/task1/training_scripts/training.py b/checkthat/task1/training_scripts/training.py new file mode 100644 index 0000000..c6e849a --- /dev/null +++ b/checkthat/task1/training_scripts/training.py @@ -0,0 +1,72 @@ +"""Training script for the model. + +This script trains the model for a single seed. 
+""" +import wandb +from transformers import Trainer, EarlyStoppingCallback +from checkthat.task1.tokenization.tokenizer import TextDataset +from checkthat.task1.models.custom_model import CustomModel +from checkthat.task1.metrics.compute_metrics import compute_metrics +from checkthat.task1.training_scripts.train_config import get_training_arguments +import random +import numpy as np +import torch + + +def set_seed(seed): + """Set seed for reproducibility.""" + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + + +def run_training(seed, dataset, model_name, tokenizer, label_map): + """Start training the model for a single seed. + + Args: + seed: seed for reproducibility + dataset: dataset dictionary containing train and validation splits + model_name: huggingface model name + tokenizer: huggerface tokenizer/same as model name + label_map: dictionary mapping labels to integers + """ + # Initialize wandb run + set_seed(seed) + run_name = f"{model_name}_{seed}" + wandb.init( + project="Clef2024", + entity="aarnes", + name=run_name, + config={"seed": seed}, + ) + + # Prepare datasets + train_dataset = TextDataset(dataset["train"], tokenizer, label_map) + eval_dataset = TextDataset(dataset["validation"], tokenizer, label_map) + test_dataset = TextDataset(dataset["test"], tokenizer, label_map) + + training_arguments = get_training_arguments() + training_arguments.run_name = ( + run_name # Optional, sync the name with Trainer's internal wandb run + ) + + # Creating a Trainer instance with training arguments and datasets + trainer = Trainer( + model=CustomModel(model_name, num_labels=len(label_map)), + args=training_arguments, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + test_dataset=test_dataset, + compute_metrics=compute_metrics, + callbacks=[ + EarlyStoppingCallback(early_stopping_patience=3) + ], # Early stopping callback + ) + + # Train the model + trainer.train() + + # Finish the wandb run after each seed + wandb.finish() diff --git a/requirements.txt b/requirements.txt index 7cd7337..7f5bd59 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,79 @@ docformatter pre-commit pydocstyle==6.1.1 deep_translator + +accelerate==0.29.3 +aiohttp==3.9.5 +aiosignal==1.3.1 +appdirs==1.4.4 +async-timeout==4.0.3 +attrs==23.2.0 +Brotli==1.0.9 +certifi==2024.2.2 +chardet==4.0.0 +charset-normalizer==2.0.4 +click==8.1.7 +datasets==2.19.0 +dill==0.3.8 +docker-pycreds==0.4.0 +evaluate==0.4.2 +exceptiongroup==1.2.1 +filelock==3.13.1 +frozenlist==1.4.1 +fsspec==2023.10.0 +huggingface-hub==0.22.2 +idna==3.4 +importlib-metadata==7.0.1 +iniconfig==2.0.0 +Jinja2==3.1.3 +joblib==1.4.0 +MarkupSafe==2.1.5 +mpmath==1.3.0 +multidict==6.0.5 +multiprocess==0.70.16 +networkx==3.2.1 +numpy==1.26.4 +packaging==23.2 +pandas==2.2.2 +pillow==10.3.0 +pip==23.3.1 +pluggy==1.5.0 +protobuf==4.25.3 +psutil==5.9.8 +pyarrow==16.0.0 +pyarrow-hotfix==0.6 +PySocks==1.7.1 +pytest==8.1.1 +pytest-mock==3.14.0 +python-dateutil==2.9.0.post0 +pytz==2024.1 +PyYAML==6.0.1 +regex==2023.10.3 +requests==2.31.0 +safetensors==0.4.2 +scikit-learn==1.4.2 +scipy==1.13.0 +sentry-sdk==1.45.0 +setproctitle==1.3.3 +setuptools==68.2.2 +six==1.16.0 +smmap==5.0.1 +sympy==1.12 +threadpoolctl==3.4.0 +tokenizers==0.19.0 +toml==0.10.2 +tomli==2.0.1 +torch==2.2.2 +torchaudio==2.2.2 +torchvision==0.17.2 +tqdm==4.65.0 +transformers==4.40.0 +types-PyYAML==6.0.12.20240311 +typing_extensions==4.11.0 +tzdata==2024.1 +urllib3==2.1.0 +wandb==0.16.6 +wheel==0.41.2 
+xxhash==3.4.1 +yarl==1.9.4 +zipp==3.17.0 diff --git a/tests/__init__.py b/tests/__init__.py index f4c5f52..8e84bfd 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +1 @@ -"""Module level init for tests.""" +"""Module level init for tests.""" diff --git a/tests/metrics/test_compute_metrics.py b/tests/metrics/test_compute_metrics.py new file mode 100644 index 0000000..ac8d3bb --- /dev/null +++ b/tests/metrics/test_compute_metrics.py @@ -0,0 +1,50 @@ +import pytest +import numpy as np +from unittest.mock import MagicMock +from evaluate import load +from checkthat.task1.metrics.compute_metrics import compute_metrics + + +@pytest.fixture +def mock_metrics(mocker): + """Mock the load function to return predefined metric values. + + The mock function returns predefined values for accuracy, precision, + recall, and f1. + """ + mocker.patch( + "evaluate.load", + side_effect=lambda metric_name: MagicMock( + compute=MagicMock( + return_value={ + "accuracy": 0.75, + "precision": 0.875, + "recall": 0.75, + "f1": 0.7666666666666667, + } + if metric_name in ["accuracy", "precision", "recall", "f1"] + else None + ) + ), + ) + + +def test_compute_metrics(mock_metrics): + """Test the compute_metrics function.""" + logits = np.array( + [ + [0.1, 0.9], # Predicted class 1 + [0.8, 0.2], # Predicted class 0 + [0.6, 0.4], # Predicted class 0 (incorrect, should be 1) + [0.3, 0.7], # Predicted class 1 (correct) + ] + ) + labels = np.array([1, 0, 1, 1]) + eval_pred = (logits, labels) + + results = compute_metrics(eval_pred) + + assert results["accuracy"] == 0.75 + assert results["precision"] == pytest.approx(0.875) + assert results["recall"] == 0.75 + assert results["f1"] == pytest.approx(0.7666666666666667) diff --git a/tests/metrics/test_metrics_logger.py b/tests/metrics/test_metrics_logger.py new file mode 100644 index 0000000..b6191eb --- /dev/null +++ b/tests/metrics/test_metrics_logger.py @@ -0,0 +1,64 @@ +"""Test for the metrics_logger module.""" + +import numpy as np +from unittest.mock import patch +from checkthat.task1.metrics.metrics_logger import ( + compute_custom_metrics, + MetricsLoggerCallback, + get_callbacks, +) # Replace 'your_module' with the actual module name + +# Test for compute_custom_metrics +def test_compute_custom_metrics(): + """Mock test for compute_custom_metrics.""" + # Define mock logits and labels + logits = np.array([[0.1, 0.9], [0.8, 0.2], [0.55, 0.45]]) + labels = np.array([1, 0, 1]) + + # Expected results + precision, recall, f1 = compute_custom_metrics(logits, labels) + + # Assert conditions + assert precision >= 0, "Precision should be non-negative" + assert recall >= 0, "Recall should be non-negative" + assert f1 >= 0, "F1 score should be non-negative" + # You can add more detailed assertions here based on known input and output + + +# Test for MetricsLoggerCallback +@patch( + "checkthat.task1.metrics.metrics_logger.wandb.log" +) # Mock the wandb.log method +def test_metrics_logger_callback(mock_log): + """Mock test for MetricsLoggerCallback.""" + # Create an instance of MetricsLoggerCallback + callback = MetricsLoggerCallback() + + # Create mock arguments + args = None # Depending on the real use case, populate this correctly + state = type( + "state", (object,), {"epoch": 1} + ) # Mock state with an epoch attribute + logits = np.array( + [[10, 0], [0, 10]] + ) # Very clear separation of class predictions + labels = np.array([0, 1]) # Correct labels aligning with logits + + # Execute the on_evaluate method + callback.on_evaluate(args, state, logits=logits, 
labels=labels) + + # Check that wandb.log was called with expected values + mock_log.assert_called_with( + {"precision": 1.0, "recall": 1.0, "f1_score": 1.0, "epoch": 1} + ) + + +# Test for get_callbacks +def test_get_callbacks(): + """Test for get_callbacks.""" + # Get callback instances + callbacks = get_callbacks(["MetricsLoggerCallback"]) + # Check that the correct callbacks are returned + assert len(callbacks) == 1 and isinstance( + callbacks[0], MetricsLoggerCallback + ), "Should return an instance of MetricsLoggerCallback" diff --git a/tests/models/__init__.py b/tests/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_models/test_transformer_model.py b/tests/models/test_abstract_model.py similarity index 76% rename from tests/test_models/test_transformer_model.py rename to tests/models/test_abstract_model.py index 4381d36..e741ff6 100644 --- a/tests/test_models/test_transformer_model.py +++ b/tests/models/test_abstract_model.py @@ -1,8 +1,7 @@ """Tests for the transformer model.""" - import pytest import torch -from checkthat.task1.models.transformer_model import Model +from checkthat.task1.models.abstract_model import Model class ConcreteModel(Model): @@ -14,9 +13,6 @@ def forward(self, x): This method takes an input tensor and returns an output tensor where each element is doubled. - Args: - x (torch.Tensor): Input tensor. - Returns: torch.Tensor: Output tensor, where each element is doubled. """ @@ -24,11 +20,7 @@ def forward(self, x): def test_model_cannot_be_instantiated(): - """Test if the Model class cannot be instantiated. - - This test checks whether attempting to instantiate the abstract - Model class raises a TypeError, as expected for abstract classes. - """ + """Test if the Model class cannot be instantiated.""" with pytest.raises(TypeError): Model() # Directly test instantiation without assignment diff --git a/tests/models/test_custom_model.py b/tests/models/test_custom_model.py new file mode 100644 index 0000000..380ec6e --- /dev/null +++ b/tests/models/test_custom_model.py @@ -0,0 +1,41 @@ +"""Tests for the CustomModel class.""" +import pytest +from unittest.mock import patch +from transformers import AutoModelForSequenceClassification +import torch +from checkthat.task1.models.custom_model import CustomModel + + +def test_custom_model_initialization(): + """Test for the initialization of the CustomModel class. + + The model should be initialized with the correct model name and + number of labels. + """ + model_name = "bert-base-uncased" + num_labels = 2 + with patch.object( + AutoModelForSequenceClassification, "from_pretrained", return_value=None + ) as mock_method: + model = CustomModel(model_name, num_labels) + mock_method.assert_called_once_with(model_name, num_labels=num_labels) + + +def test_custom_model_forward(): + """Test for the forward method of the CustomModel class. + + The forward method should return a dictionary with the key 'loss' + when labels are provided. 
+ """ + model_name = "bert-base-uncased" + num_labels = 2 + model = CustomModel(model_name, num_labels) + input_ids = torch.randint(0, 1000, (1, 10)) + attention_mask = torch.ones(1, 10) + labels = torch.tensor([1]) + + with patch.object( + AutoModelForSequenceClassification, "from_pretrained", return_value=None + ): + output = model.forward(input_ids, attention_mask, labels) + assert "loss" in output.keys() diff --git a/tests/tokenization/test_normalize_DatasetDict_features.py b/tests/tokenization/test_normalize_DatasetDict_features.py new file mode 100644 index 0000000..ff3228d --- /dev/null +++ b/tests/tokenization/test_normalize_DatasetDict_features.py @@ -0,0 +1,84 @@ +"""Test cases for the normalize_DatasetDict_features function.""" +import pytest +from datasets import DatasetDict, Dataset +from checkthat.task1.tokenization.normalize_DatasetDict_featues import ( + rename_features, +) + + +@pytest.fixture +def sample_data(): + """Fixture providing sample data.""" + # Create a DatasetDict with sample data + train_data = { + "tweet_id": [1, 2, 3], + "tweet_text": ["text1", "text2", "text3"], + "class_label": [0, 1, 0], + } + validation_data = { + "tweet_id": [4, 5], + "tweet_text": ["text4", "text5"], + "class_label": [1, 0], + } + test_data = { + "tweet_id": [6, 7], + "tweet_text": ["text6", "text7"], + "class_label": [0, 1], + } + return DatasetDict( + { + "train": Dataset.from_dict(train_data), + "validation": Dataset.from_dict(validation_data), + "test": Dataset.from_dict(test_data), + } + ) + + +@pytest.fixture +def expected_data(): + """Fixture providing the expected data after renaming 'tweet_text' to + 'Text'.""" + # Define the expected result after renaming 'tweet_text' to 'Text' + train_data = { + "tweet_id": [1, 2, 3], + "Text": ["text1", "text2", "text3"], + "class_label": [0, 1, 0], + } + validation_data = { + "tweet_id": [4, 5], + "Text": ["text4", "text5"], + "class_label": [1, 0], + } + test_data = { + "tweet_id": [6, 7], + "Text": ["text6", "text7"], + "class_label": [0, 1], + } + return DatasetDict( + { + "train": Dataset.from_dict(train_data), + "validation": Dataset.from_dict(validation_data), + "test": Dataset.from_dict(test_data), + } + ) + + +def test_rename_features(sample_data, expected_data): + """Test for the rename_features function.""" + # Call the function to rename features + result = rename_features(sample_data) + + # Compare individual datasets within result and expected_data + for split_name in sample_data.keys(): + result_dataset = result[split_name] + expected_dataset = expected_data[split_name] + + # Check if feature names are the same + assert result_dataset.features == expected_dataset.features + + # Check if number of rows is the same + assert len(result_dataset) == len(expected_dataset) + + # Check if each row in result matches corresponding row in expected_data + for result_row, expected_row in zip(result_dataset, expected_dataset): + assert result_row == expected_row diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py new file mode 100644 index 0000000..3973e75 --- /dev/null +++ b/tests/tokenization/test_tokenizer.py @@ -0,0 +1,34 @@ +import pytest +from checkthat.task1.tokenization.tokenizer import TextDataset +from transformers import AutoTokenizer + + +def test_text_dataset_length(): + """Test the length of the TextDataset. + + The length of the dataset should be equal to the number of data + samples. 
+ """ + data = [{"Text": "Example text", "class_label": "Yes"}] + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + label_map = {"Yes": 1} + + dataset = TextDataset(data, tokenizer, label_map) + assert len(dataset) == 1 + + +def test_text_dataset_getitem(): + """Test the __getitem__ method of the TextDataset. + + The __getitem__ method should return a dictionary with the keys + 'input_ids', 'attention_mask', and 'labels'. + """ + data = [{"Text": "Example text", "class_label": "Yes"}] + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + label_map = {"Yes": 1} + + dataset = TextDataset(data, tokenizer, label_map) + item = dataset[0] + assert "input_ids" in item + assert "attention_mask" in item + assert "labels" in item diff --git a/tests/training_scripts/test_train_config.py b/tests/training_scripts/test_train_config.py new file mode 100644 index 0000000..a41a361 --- /dev/null +++ b/tests/training_scripts/test_train_config.py @@ -0,0 +1,23 @@ +import pytest +from unittest.mock import patch, mock_open +from checkthat.task1.training_scripts.train_config import ( + load_config, + get_training_arguments, +) + + +def test_load_config(): + """Test for load_config.""" + with patch("builtins.open", mock_open(read_data="training_arguments: {}")): + config = load_config("dummy_path") + assert config == {"training_arguments": {}} + + +def test_get_training_arguments(): + """Test for get_training_arguments.""" + with patch( + "checkthat.task1.training_scripts.train_config.load_config", + return_value={"training_arguments": {"output_dir": "test"}}, + ): + training_args = get_training_arguments() + assert training_args.output_dir == "test" diff --git a/tests/training_scripts/test_training.py b/tests/training_scripts/test_training.py new file mode 100644 index 0000000..5c743fc --- /dev/null +++ b/tests/training_scripts/test_training.py @@ -0,0 +1,34 @@ +import pytest +from unittest.mock import patch, MagicMock +from checkthat.task1.training_scripts.training import run_training + + +@patch("checkthat.task1.training_scripts.training.wandb.init") +@patch("checkthat.task1.training_scripts.training.Trainer") +@patch( + "checkthat.task1.training_scripts.train_config.load_config" +) # Mock load_config function +def test_run_training(mock_load_config, mock_trainer, mock_wandb_init): + # Setup the mocks + mock_trainer.return_value.train.return_value = None + mock_wandb_init.return_value = None + mock_load_config.return_value = { + "training_arguments": { + "output_dir": "some/path", + "evaluation_strategy": "steps", + # Add other needed arguments + } + } + + # Call the function + run_training( + seed=42, + dataset={"train": [], "validation": [], "test": []}, + model_name="bert-base-uncased", + tokenizer=MagicMock(), + label_map={}, + ) + + # Assertions + mock_wandb_init.assert_called_once() + mock_trainer.assert_called_once() From ce8166b0b02b286ec65ba369c570487411e958cc Mon Sep 17 00:00:00 2001 From: = Date: Wed, 1 May 2024 19:15:58 +0200 Subject: [PATCH 03/16] Small fixes --- checkthat/task1/main_train_all.py | 27 ++++++++++++------- checkthat/task1/models/custom_model.py | 4 ++- checkthat/task1/training_config.yaml | 8 +++--- .../task1/training_scripts/train_config.py | 2 +- checkthat/task1/training_scripts/training.py | 17 +++++++----- requirements.txt | 1 - 6 files changed, 36 insertions(+), 23 deletions(-) diff --git a/checkthat/task1/main_train_all.py b/checkthat/task1/main_train_all.py index 2725a2f..97e37dd 100644 --- a/checkthat/task1/main_train_all.py +++ 
b/checkthat/task1/main_train_all.py @@ -1,8 +1,8 @@ """Main training script for training on all languages.""" from datasets import load_dataset from tokenization.normalize_DatasetDict_featues import rename_features -from transformers import AutoTokenizer from training_scripts.training import run_training +from transformers import AutoTokenizer def main(): @@ -23,15 +23,24 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_name_en) - for seed, dataset in zip(seeds, dataset_list): - dataset = load_dataset(dataset) - # Normalize dataset features if not already normalized (intended for twitter dataset) - if dataset["train"]["tweet_text"]: - dataset = rename_features(dataset) - tokenizer = AutoTokenizer.from_pretrained(multilingual_model) - - run_training(seed, dataset, model_name_en, tokenizer, label_map) + for dataset in dataset_list: + for seed in seeds: + dataset = load_dataset(dataset) + # Normalize dataset features if not already normalized (intended for twitter dataset) + if "tweet_text" in dataset["train"].column_names: + dataset = rename_features(dataset) + tokenizer = AutoTokenizer.from_pretrained(multilingual_model) + run_training(seed, dataset, multilingual_model, tokenizer, label_map) + else: + run_training(seed, dataset, model_name_en, tokenizer, label_map) if __name__ == "__main__": + import torch + + print(torch.cuda.is_available()) + print(torch.cuda.current_device()) + print(torch.cuda.device(0)) + print(torch.cuda.device_count()) + print(torch.cuda.get_device_name(0)) main() diff --git a/checkthat/task1/models/custom_model.py b/checkthat/task1/models/custom_model.py index cb5ed31..07f7cf6 100644 --- a/checkthat/task1/models/custom_model.py +++ b/checkthat/task1/models/custom_model.py @@ -5,7 +5,7 @@ class CustomModel(Model): - def __init__(self, model_name: str, num_labels: int): + def __init__(self, model_name: str, num_labels: int, device: str): """Constructor for the CustomModel class. 
Args: @@ -13,6 +13,8 @@ def __init__(self, model_name: str, num_labels: int): num_labels (int): Number of labels in the dataset """ super(CustomModel, self).__init__() + if device is not None: + self.to(device) self.model = AutoModelForSequenceClassification.from_pretrained( model_name, num_labels=num_labels ) diff --git a/checkthat/task1/training_config.yaml b/checkthat/task1/training_config.yaml index a9b9f41..b74fa79 100644 --- a/checkthat/task1/training_config.yaml +++ b/checkthat/task1/training_config.yaml @@ -1,12 +1,12 @@ training_arguments: - evaluation_strategy: 'IntervalStrategy.epoch' # To change evaluation strategy comment out the line and uncomment the next line two lines + evaluation_strategy: 'epoch' # To change evaluation strategy comment out the line and uncomment the next line two lines # eval_steps: 500 # evaluation_strategy: 'steps' # evaluate after some number of steps output_dir: './results' # output directory save_total_limit: 5 # number of maximum checkpoints to save num_train_epochs: 2 # number of training epochs - per_device_train_batch_size: 32 # batch size for training - per_device_eval_batch_size: 32 # batch size for evaluation + per_device_train_batch_size: 16 # batch size for training + per_device_eval_batch_size: 16 # batch size for evaluation warmup_steps: 500 # number of warmup steps for learning rate scheduler weight_decay: 0.01 # strength of weight decay logging_dir: './logs' # directory for storing logs @@ -14,5 +14,5 @@ training_arguments: load_best_model_at_end: True metric_for_best_model: 'loss' # metric to use for saving best model report_to: 'wandb' # report to wandb - + save_strategy: 'epoch' # save model after each epoch diff --git a/checkthat/task1/training_scripts/train_config.py b/checkthat/task1/training_scripts/train_config.py index 96c8651..69121ba 100644 --- a/checkthat/task1/training_scripts/train_config.py +++ b/checkthat/task1/training_scripts/train_config.py @@ -14,6 +14,6 @@ def load_config(file_path): def get_training_arguments(): """Unpack training arguments from the config file and return as a TrainingArguments object.""" - config = load_config("training_config.yaml") + config = load_config("checkthat/task1/training_config.yaml") training_args = config["training_arguments"] return TrainingArguments(**training_args) diff --git a/checkthat/task1/training_scripts/training.py b/checkthat/task1/training_scripts/training.py index c6e849a..b8847c4 100644 --- a/checkthat/task1/training_scripts/training.py +++ b/checkthat/task1/training_scripts/training.py @@ -4,14 +4,18 @@ """ import wandb from transformers import Trainer, EarlyStoppingCallback -from checkthat.task1.tokenization.tokenizer import TextDataset -from checkthat.task1.models.custom_model import CustomModel -from checkthat.task1.metrics.compute_metrics import compute_metrics -from checkthat.task1.training_scripts.train_config import get_training_arguments +from tokenization.tokenizer import TextDataset +from models.custom_model import CustomModel +from metrics.compute_metrics import compute_metrics +from training_scripts.train_config import get_training_arguments import random import numpy as np import torch - +import os +import torch.cuda +import torch +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True def set_seed(seed): """Set seed for reproducibility.""" @@ -54,11 +58,10 @@ def run_training(seed, dataset, model_name, tokenizer, label_map): # Creating a Trainer instance with training arguments and datasets trainer = Trainer( - 
model=CustomModel(model_name, num_labels=len(label_map)), + model=CustomModel(model_name, num_labels=len(label_map), device='cuda'), args=training_arguments, train_dataset=train_dataset, eval_dataset=eval_dataset, - test_dataset=test_dataset, compute_metrics=compute_metrics, callbacks=[ EarlyStoppingCallback(early_stopping_patience=3) diff --git a/requirements.txt b/requirements.txt index 7f5bd59..0bb079a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -41,7 +41,6 @@ numpy==1.26.4 packaging==23.2 pandas==2.2.2 pillow==10.3.0 -pip==23.3.1 pluggy==1.5.0 protobuf==4.25.3 psutil==5.9.8 From 856c8df6bb85c78da11c54e1819303dc17b506e7 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 1 May 2024 21:42:14 +0200 Subject: [PATCH 04/16] Optimized code and fixed save for main_train_all.py so folders get organised for each model and language --- checkthat/task1/main_train_all.py | 31 ++++++++---------- checkthat/task1/training_config.yaml | 4 +-- .../task1/training_scripts/train_config.py | 18 ++++++---- checkthat/task1/training_scripts/training.py | 6 ++-- requirements.txt | Bin 1336 -> 3520 bytes 5 files changed, 30 insertions(+), 29 deletions(-) diff --git a/checkthat/task1/main_train_all.py b/checkthat/task1/main_train_all.py index 97e37dd..9498045 100644 --- a/checkthat/task1/main_train_all.py +++ b/checkthat/task1/main_train_all.py @@ -3,37 +3,34 @@ from tokenization.normalize_DatasetDict_featues import rename_features from training_scripts.training import run_training from transformers import AutoTokenizer - +from training_scripts.train_config import get_training_arguments def main(): - en, ar, es, nl = ( + dataset_list = [ "iai-group/clef2024_checkthat_task1_en", "iai-group/clef2024_checkthat_task1_ar", "iai-group/clef2024_checkthat_task1_es", "iai-group/clef2024_checkthat_task1_nl", - ) - - dataset_list = [en, ar, es, nl] - label_map = {"No": 0, "Yes": 1} # Label map for the dataset - - model_name_en = "FacebookAI/roberta-large" - multilingual_model = "FacebookAI/xlm-roberta-large" - - seeds = [42, 81, 1024, 6, 10] # Seeds for reproducibility + ] + label_map = {"No": 0, "Yes": 1} + model_name_en = "distilbert/distilroberta-base" + multilingual_model = "FacebookAI/xlm-roberta-base" + seeds = [42, 81, 1024, 6, 10] tokenizer = AutoTokenizer.from_pretrained(model_name_en) - for dataset in dataset_list: + for dataset_name in dataset_list: for seed in seeds: - dataset = load_dataset(dataset) - # Normalize dataset features if not already normalized (intended for twitter dataset) + dataset = load_dataset(dataset_name) if "tweet_text" in dataset["train"].column_names: dataset = rename_features(dataset) tokenizer = AutoTokenizer.from_pretrained(multilingual_model) - run_training(seed, dataset, multilingual_model, tokenizer, label_map) + training_args = get_training_arguments(multilingual_model, seed, dataset_name) + # run training with these arguments else: - run_training(seed, dataset, model_name_en, tokenizer, label_map) - + training_args = get_training_arguments(model_name_en, seed, dataset_name) + run_training(seed, dataset, model_name_en, tokenizer, label_map, training_args) + # run training with these arguments if __name__ == "__main__": import torch diff --git a/checkthat/task1/training_config.yaml b/checkthat/task1/training_config.yaml index b74fa79..3480178 100644 --- a/checkthat/task1/training_config.yaml +++ b/checkthat/task1/training_config.yaml @@ -1,10 +1,11 @@ training_arguments: + save_strategy: 'epoch' # save model after each epoch evaluation_strategy: 'epoch' # To change evaluation 
strategy comment out the line and uncomment the next line two lines # eval_steps: 500 # evaluation_strategy: 'steps' # evaluate after some number of steps output_dir: './results' # output directory save_total_limit: 5 # number of maximum checkpoints to save - num_train_epochs: 2 # number of training epochs + num_train_epochs: 5 # number of training epochs per_device_train_batch_size: 16 # batch size for training per_device_eval_batch_size: 16 # batch size for evaluation warmup_steps: 500 # number of warmup steps for learning rate scheduler @@ -14,5 +15,4 @@ training_arguments: load_best_model_at_end: True metric_for_best_model: 'loss' # metric to use for saving best model report_to: 'wandb' # report to wandb - save_strategy: 'epoch' # save model after each epoch diff --git a/checkthat/task1/training_scripts/train_config.py b/checkthat/task1/training_scripts/train_config.py index 69121ba..277f7fb 100644 --- a/checkthat/task1/training_scripts/train_config.py +++ b/checkthat/task1/training_scripts/train_config.py @@ -1,19 +1,23 @@ -"""Module to load training arguments from a yaml file.""" - import yaml from transformers import TrainingArguments - def load_config(file_path): """Load configuration from a yaml file.""" with open(file_path, "r") as file: config = yaml.safe_load(file) return config - -def get_training_arguments(): +def get_training_arguments(model_name, seed, dataset_name): """Unpack training arguments from the config file and return as a - TrainingArguments object.""" + TrainingArguments object, with dynamically adjusted output directory based on model name, seed, and dataset.""" config = load_config("checkthat/task1/training_config.yaml") training_args = config["training_arguments"] - return TrainingArguments(**training_args) + + # Extract a short language identifier from the dataset name + language_code = dataset_name.split('_')[-1] # Assuming the dataset name ends with a language code + + # Modify the output_dir dynamically + model_name_safe = model_name.replace('/', '_') # Replace '/' with '_' for filesystem compatibility + training_args['output_dir'] = f"./results/{model_name_safe}_seed_{seed}_{language_code}" + + return TrainingArguments(**training_args) \ No newline at end of file diff --git a/checkthat/task1/training_scripts/training.py b/checkthat/task1/training_scripts/training.py index b8847c4..3b9a744 100644 --- a/checkthat/task1/training_scripts/training.py +++ b/checkthat/task1/training_scripts/training.py @@ -26,7 +26,7 @@ def set_seed(seed): torch.cuda.manual_seed_all(seed) -def run_training(seed, dataset, model_name, tokenizer, label_map): +def run_training(seed, dataset, model_name, tokenizer, label_map, training_arguments): """Start training the model for a single seed. 
Args: @@ -44,14 +44,14 @@ def run_training(seed, dataset, model_name, tokenizer, label_map): entity="aarnes", name=run_name, config={"seed": seed}, - ) + ) # Prepare datasets train_dataset = TextDataset(dataset["train"], tokenizer, label_map) eval_dataset = TextDataset(dataset["validation"], tokenizer, label_map) test_dataset = TextDataset(dataset["test"], tokenizer, label_map) - training_arguments = get_training_arguments() + # training_arguments = get_training_arguments() training_arguments.run_name = ( run_name # Optional, sync the name with Trainer's internal wandb run ) diff --git a/requirements.txt b/requirements.txt index 0bb079ace081eaecc8180d3892136ecee15c0951..5d2c97de202f66d51c1e76a6f556f6e2f28e23e0 100644 GIT binary patch literal 3520 zcmZ{nOOF#t5QOWDw7&!y;r1{LhdnHpl~y1japJV{8-pK>+lHC>@$ANz(dlZ>va*JG zWL9QmWMo$H&(CR@>$5CdeI3iRob>&7eO{Iqr73Up@1*=zp2@N)hw@qWQ)$CvA{+Ks z1A8a0b=k{%8$RPW|Due0pH^oV@;R37V!x5kO>ghYgHF%ONLe;zCoG3@*3Tfq4^)W< zBpIjCU%`t9XlFV(jgy9`Qj|qD&(Wr>O@_x8bpn<)km?bgzg%j*c4_c$Z?7UJ+?8 zZ&*A<N#*%<=5UX%e^8{CyG3%%C+KIOPSrwkR1guNwAyTQ{_1H=_y#r zIt_$(f#|jDuy0mZ5uL8?%_D{t=S(%$TIWxy+>E+vA~?pMVf`S6 zPEsJ1k;gI6f+yp%xLND7iON?(4jUt#IQMbM*M$14^i3#vAAGG8ov1KGk0!mo@L@}2 zM(cx#Z-s{@V$`Tvj#@R&pCB8K{eih3U)y)9(3BvA4k&6EfFN z_}qrQ#!sI%6har!K+Q$QanJ8gyrcchH6AD|Z6igvS1y#s-iE)rl`D69d7R3R=*w4~ zAST$W>e)!%2K#=)!3#Ub_3c$33{*T#ROha@;)Pk6UQ|Yg+~}}}$37HLHxb#bH>~hF zoCkAJ5w@z2&2vy4<^+%0fmSx6&oMX8rWGW1|xxxttZcaZwM>xl(kI>-)w zP{yIc>7^$PPw&@jKQdo?)g9-KrCyBfgW`NtM!H@V2R}6rx+Dwi_vNm)_OrHop{`cNwhYK-k}U{bGq`>6IMI}@LgwsyPmCm zlzpVA3&l*oo7moo3rx9%P$=8liXk5nT_sz^ELa`p$(WgrsiKg<^ON20gaTF1WM^jS z$ug!XZ+y>w&Ro9i$i{tMwd>8yoAP@5@bUu%+MpBME_9z)5!=@jM;%p5Oe?Pn?i#E0QYgVJ!5Qx1JT}{o0i0`rBDi8hu^+@xEnG9iE)H_WCZ5 z-0nnnMrJKBeN&WO#Gu;SL8t4xF83lGaNYRcL^P{{y}^5=lXm`pKHND^(@Uu&-y%Nk zH^$$J^6PtvoPJMc&PQQqZ~G-t-`(cw9D2U)Tje12c2-U3jN5;5;S$9<3k>oi;f;vS z^Nu4MEPDIF`VV<?ipOX zu{T=0x$zsNL%0ZSM>=SZc8Pg2BAh8@Zfx8(Cy0Tk$l-aK&TZfJG&85~l1(VyNX$gW zTK1fcAf#)D_jV-X;wqA+X>8xy7d4B!2IHrxZ%5QHDa|eyHBaq{OgVyW#BCgRnWj_Q zX$=vL7{+#JJNi3E@lF%E_MI$eaSP0$8&2>KWLJ}C{!H$wSy-B&n7n&%Ml4cKWWBSu zX?g>eSDK9&glU_$e@C0yc?^m6?VD0^#oW#X&&2b0f z9^>@1IL{vW%0RD(mmD7)yek+{LF1*A#I`~}K!?)ne!aSIU5w>s-7l3*#pdQ)E=aFDBpi^vx_#4AVlP zEkXKK{UZk1b}w#r7PFpF9d3O{Ia)52h*z{cQ(*as`F7VP3OejtIw}Ka%$l%iM)9Br z{IJ7pD$|N7f#~>wVCA}_DXv(GO6@$$tM`~s?^9EnTw2f-+h8}G)xS1;D-JKvQ}6vmj)Ln)TW^A_rN z(9@nhwtduD)cb+WtJefh&h)V|S4=0ID&6m^DZrr&^n^u%tB3L@dbm_SFpS>O{~$V( AjsO4v From cc4c9705a77300aee5a8c9846d99e3c8fe078589 Mon Sep 17 00:00:00 2001 From: = Date: Thu, 2 May 2024 00:04:42 +0200 Subject: [PATCH 05/16] Small fix and changed traning config to evaluate on step instead of epoch --- checkthat/task1/main_train_all.py | 7 +++---- checkthat/task1/training_config.yaml | 10 +++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/checkthat/task1/main_train_all.py b/checkthat/task1/main_train_all.py index 9498045..500573a 100644 --- a/checkthat/task1/main_train_all.py +++ b/checkthat/task1/main_train_all.py @@ -14,8 +14,8 @@ def main(): ] label_map = {"No": 0, "Yes": 1} - model_name_en = "distilbert/distilroberta-base" - multilingual_model = "FacebookAI/xlm-roberta-base" + model_name_en = "FacebookAI/roberta-large" + multilingual_model = "FacebookAI/xlm-roberta-large" seeds = [42, 81, 1024, 6, 10] tokenizer = AutoTokenizer.from_pretrained(model_name_en) @@ -26,11 +26,10 @@ def main(): dataset = rename_features(dataset) tokenizer = AutoTokenizer.from_pretrained(multilingual_model) training_args = get_training_arguments(multilingual_model, seed, dataset_name) - # run training with 
these arguments + run_training(seed, dataset, model_name_en, tokenizer, label_map, training_args) else: training_args = get_training_arguments(model_name_en, seed, dataset_name) run_training(seed, dataset, model_name_en, tokenizer, label_map, training_args) - # run training with these arguments if __name__ == "__main__": import torch diff --git a/checkthat/task1/training_config.yaml b/checkthat/task1/training_config.yaml index 3480178..a4e9d66 100644 --- a/checkthat/task1/training_config.yaml +++ b/checkthat/task1/training_config.yaml @@ -1,11 +1,11 @@ training_arguments: - save_strategy: 'epoch' # save model after each epoch - evaluation_strategy: 'epoch' # To change evaluation strategy comment out the line and uncomment the next line two lines - # eval_steps: 500 - # evaluation_strategy: 'steps' # evaluate after some number of steps + # save_strategy: 'epoch' # save model after each epoch + # evaluation_strategy: 'epoch' # To change evaluation strategy comment out the line and uncomment the next line two lines + eval_steps: 500 + evaluation_strategy: 'steps' # evaluate after some number of steps output_dir: './results' # output directory save_total_limit: 5 # number of maximum checkpoints to save - num_train_epochs: 5 # number of training epochs + num_train_epochs: 50 # number of training epochs per_device_train_batch_size: 16 # batch size for training per_device_eval_batch_size: 16 # batch size for evaluation warmup_steps: 500 # number of warmup steps for learning rate scheduler From 232757ab8db90dba3376a842851f14c873a530a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20R=C3=B8ysland=20Aarnes?= Date: Thu, 2 May 2024 10:06:40 +0200 Subject: [PATCH 06/16] Added so that wandb will report what language model is being used --- checkthat/task1/training_scripts/__init__.py | 1 + checkthat/task1/training_scripts/train_config.py | 7 ++++++- checkthat/task1/training_scripts/training.py | 5 +++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/checkthat/task1/training_scripts/__init__.py b/checkthat/task1/training_scripts/__init__.py index 422193e..57c4fe1 100644 --- a/checkthat/task1/training_scripts/__init__.py +++ b/checkthat/task1/training_scripts/__init__.py @@ -2,3 +2,4 @@ from .training import run_training from .train_config import get_training_arguments +from .train_config import get_language diff --git a/checkthat/task1/training_scripts/train_config.py b/checkthat/task1/training_scripts/train_config.py index 277f7fb..9de1546 100644 --- a/checkthat/task1/training_scripts/train_config.py +++ b/checkthat/task1/training_scripts/train_config.py @@ -20,4 +20,9 @@ def get_training_arguments(model_name, seed, dataset_name): model_name_safe = model_name.replace('/', '_') # Replace '/' with '_' for filesystem compatibility training_args['output_dir'] = f"./results/{model_name_safe}_seed_{seed}_{language_code}" - return TrainingArguments(**training_args) \ No newline at end of file + return TrainingArguments(**training_args) + +def get_language(dataset_name): + """Extract the language code from the dataset name.""" + dataset_language = dataset_name.split('_')[-1] # Assuming the dataset name ends with a language code + return dataset_language \ No newline at end of file diff --git a/checkthat/task1/training_scripts/training.py b/checkthat/task1/training_scripts/training.py index 3b9a744..f34fb2d 100644 --- a/checkthat/task1/training_scripts/training.py +++ b/checkthat/task1/training_scripts/training.py @@ -8,6 +8,7 @@ from models.custom_model import CustomModel from 
metrics.compute_metrics import compute_metrics from training_scripts.train_config import get_training_arguments +from training_scripts.train_config import get_language import random import numpy as np import torch @@ -26,7 +27,7 @@ def set_seed(seed): torch.cuda.manual_seed_all(seed) -def run_training(seed, dataset, model_name, tokenizer, label_map, training_arguments): +def run_training(seed, dataset, model_name, tokenizer, label_map, training_arguments, dataset_language): """Start training the model for a single seed. Args: @@ -38,7 +39,7 @@ def run_training(seed, dataset, model_name, tokenizer, label_map, training_argum """ # Initialize wandb run set_seed(seed) - run_name = f"{model_name}_{seed}" + run_name = f"{model_name}_{seed}_{dataset_language}" wandb.init( project="Clef2024", entity="aarnes", From aac9f8f4c076b6043549d9c607aa55d4e5434e8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20R=C3=B8ysland=20Aarnes?= Date: Thu, 2 May 2024 14:10:58 +0200 Subject: [PATCH 07/16] Fixing some errors in logic --- checkthat/task1/main_train_all.py | 6 +-- checkthat/task1/metrics/metrics_logger.py | 6 +-- checkthat/task1/test_scripts/__init__.py | 0 .../test_scripts/load_from_checkpoints.py | 37 +++++++++++++++++++ .../task1/test_scripts/test_devtest_test.py | 15 ++++++++ checkthat/task1/training_config.yaml | 13 ++++--- checkthat/task1/training_scripts/training.py | 2 - 7 files changed, 65 insertions(+), 14 deletions(-) create mode 100644 checkthat/task1/test_scripts/__init__.py create mode 100644 checkthat/task1/test_scripts/load_from_checkpoints.py create mode 100644 checkthat/task1/test_scripts/test_devtest_test.py diff --git a/checkthat/task1/main_train_all.py b/checkthat/task1/main_train_all.py index 500573a..6a30194 100644 --- a/checkthat/task1/main_train_all.py +++ b/checkthat/task1/main_train_all.py @@ -12,15 +12,15 @@ def main(): "iai-group/clef2024_checkthat_task1_es", "iai-group/clef2024_checkthat_task1_nl", ] - label_map = {"No": 0, "Yes": 1} + label_map = {"Yes": 1, "No": 0} model_name_en = "FacebookAI/roberta-large" multilingual_model = "FacebookAI/xlm-roberta-large" seeds = [42, 81, 1024, 6, 10] tokenizer = AutoTokenizer.from_pretrained(model_name_en) - for dataset_name in dataset_list: - for seed in seeds: + for seed in seeds: + for dataset_name in dataset_list: dataset = load_dataset(dataset_name) if "tweet_text" in dataset["train"].column_names: dataset = rename_features(dataset) diff --git a/checkthat/task1/metrics/metrics_logger.py b/checkthat/task1/metrics/metrics_logger.py index fedcd90..309b438 100644 --- a/checkthat/task1/metrics/metrics_logger.py +++ b/checkthat/task1/metrics/metrics_logger.py @@ -20,9 +20,9 @@ def compute_custom_metrics(logits, labels): predictions = np.argmax(logits, axis=1) # Convert logits to predictions # Calculate metrics - precision = precision_score(labels, predictions, average="binary") - recall = recall_score(labels, predictions, average="binary") - f1 = f1_score(labels, predictions, average="binary") + precision = precision_score(labels, predictions, average="macro", pos_label=1) + recall = recall_score(labels, predictions, average="macro", pos_label=1) + f1 = f1_score(labels, predictions, average="macro", pos_label=1) return precision, recall, f1 diff --git a/checkthat/task1/test_scripts/__init__.py b/checkthat/task1/test_scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/checkthat/task1/test_scripts/load_from_checkpoints.py b/checkthat/task1/test_scripts/load_from_checkpoints.py new file mode 100644 index 
0000000..eee440f --- /dev/null +++ b/checkthat/task1/test_scripts/load_from_checkpoints.py @@ -0,0 +1,37 @@ +import os +from transformers import AutoModelForSequenceClassification +import torch + +def find_latest_checkpoint(model_dir): + """Find the latest checkpoint in the given directory.""" + checkpoint_dirs = [os.path.join(model_dir, d) for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d)) and 'checkpoint' in d] + if not checkpoint_dirs: + raise ValueError("No checkpoint directories found in the given model directory.") + + # Sort directories to find the one with the highest step (assuming naming convention includes "checkpoint-") + latest_checkpoint = sorted(checkpoint_dirs, key=lambda x: int(x.split('-')[-1]))[-1] + return latest_checkpoint + +def load_model_from_dir(base_dir): + """Load models from a structured directory of models. + Args: + base_dir (str): Directory containing subdirectories of models named like 'FacebookAI/xlm-roberta-base_10_en' + Returns: + models (dict): Dictionary with keys as model names and values as loaded model objects. + """ + models = {} + for model_name in os.listdir(base_dir): + model_path = os.path.join(base_dir, model_name) + if not os.path.isdir(model_path): + continue + + try: + latest_checkpoint = find_latest_checkpoint(model_path) + model = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint) + models[model_name] = model + print(f"Loaded model from {latest_checkpoint}") + except Exception as e: + print(f"Failed to load model from {model_path}: {str(e)}") + + return models + diff --git a/checkthat/task1/test_scripts/test_devtest_test.py b/checkthat/task1/test_scripts/test_devtest_test.py new file mode 100644 index 0000000..456abac --- /dev/null +++ b/checkthat/task1/test_scripts/test_devtest_test.py @@ -0,0 +1,15 @@ +from load_from_checkpoints import load_model_from_dir + + +base_dir = "./results" +models = load_model_from_dir(base_dir) + + +dataset_list = [ + "iai-group/clef2024_checkthat_task1_en", + "iai-group/clef2024_checkthat_task1_ar", + ] + + + +dataset = load_dataset(dataset_name) diff --git a/checkthat/task1/training_config.yaml b/checkthat/task1/training_config.yaml index a4e9d66..a6680ad 100644 --- a/checkthat/task1/training_config.yaml +++ b/checkthat/task1/training_config.yaml @@ -1,11 +1,12 @@ training_arguments: - # save_strategy: 'epoch' # save model after each epoch - # evaluation_strategy: 'epoch' # To change evaluation strategy comment out the line and uncomment the next line two lines - eval_steps: 500 - evaluation_strategy: 'steps' # evaluate after some number of steps + save_strategy: 'epoch' # save model after each epoch + evaluation_strategy: 'epoch' # To change evaluation strategy comment out the line and uncomment the next line two lines + greater_is_better: True + # eval_steps: 500 + # evaluation_strategy: 'steps' # evaluate after some number of steps output_dir: './results' # output directory save_total_limit: 5 # number of maximum checkpoints to save - num_train_epochs: 50 # number of training epochs + num_train_epochs: 100 # number of training epochs per_device_train_batch_size: 16 # batch size for training per_device_eval_batch_size: 16 # batch size for evaluation warmup_steps: 500 # number of warmup steps for learning rate scheduler @@ -13,6 +14,6 @@ training_arguments: logging_dir: './logs' # directory for storing logs logging_steps: 10 load_best_model_at_end: True - metric_for_best_model: 'loss' # metric to use for saving best model + metric_for_best_model: "f1" # metric to 
use for saving best model report_to: 'wandb' # report to wandb diff --git a/checkthat/task1/training_scripts/training.py b/checkthat/task1/training_scripts/training.py index f34fb2d..5cfa9d9 100644 --- a/checkthat/task1/training_scripts/training.py +++ b/checkthat/task1/training_scripts/training.py @@ -12,7 +12,6 @@ import random import numpy as np import torch -import os import torch.cuda import torch torch.backends.cuda.matmul.allow_tf32 = True @@ -50,7 +49,6 @@ def run_training(seed, dataset, model_name, tokenizer, label_map, training_argum # Prepare datasets train_dataset = TextDataset(dataset["train"], tokenizer, label_map) eval_dataset = TextDataset(dataset["validation"], tokenizer, label_map) - test_dataset = TextDataset(dataset["test"], tokenizer, label_map) # training_arguments = get_training_arguments() training_arguments.run_name = ( From d3f35a6f02d91fb2bf153e7aed151cb6146eb412 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20R=C3=B8ysland=20Aarnes?= Date: Thu, 2 May 2024 15:35:31 +0200 Subject: [PATCH 08/16] Creating run tests that should handle both labeled and unlabeled data --- .gitignore | 5 ++- checkthat/task1/main_train_all.py | 20 +++++++++++ .../test_scripts/load_from_checkpoints.py | 1 + checkthat/task1/test_scripts/run_tests.py | 34 +++++++++++++++++++ .../task1/test_scripts/test_devtest_test.py | 15 -------- checkthat/task1/training_config.yaml | 5 ++- checkthat/task1/training_scripts/training.py | 11 +++--- 7 files changed, 66 insertions(+), 25 deletions(-) create mode 100644 checkthat/task1/test_scripts/run_tests.py delete mode 100644 checkthat/task1/test_scripts/test_devtest_test.py diff --git a/.gitignore b/.gitignore index 4059efb..1f08207 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,7 @@ dmypy.json .pyre/ # Exclude .DS_Store files everywhere -*.DS_Store \ No newline at end of file +*.DS_Store + +# Exclude node_modules +.vscode/ \ No newline at end of file diff --git a/checkthat/task1/main_train_all.py b/checkthat/task1/main_train_all.py index 6a30194..cb40b3f 100644 --- a/checkthat/task1/main_train_all.py +++ b/checkthat/task1/main_train_all.py @@ -4,6 +4,8 @@ from training_scripts.training import run_training from transformers import AutoTokenizer from training_scripts.train_config import get_training_arguments +from test_scripts.load_from_checkpoints import load_model_from_dir +from test_scripts.run_tests import run_testing def main(): dataset_list = [ @@ -19,6 +21,7 @@ def main(): seeds = [42, 81, 1024, 6, 10] tokenizer = AutoTokenizer.from_pretrained(model_name_en) + """Training model on trainset for each seed and each language""" for seed in seeds: for dataset_name in dataset_list: dataset = load_dataset(dataset_name) @@ -31,6 +34,23 @@ def main(): training_args = get_training_arguments(model_name_en, seed, dataset_name) run_training(seed, dataset, model_name_en, tokenizer, label_map, training_args) + + + """Testing model on testset""" + base_dir = "./results" + models = load_model_from_dir(base_dir) + for model_name, model in models.items(): + i += 1 # Incrementing i to get the model name for each model + for dataset_name in dataset_list: + dataset = load_dataset(dataset_name) + if "tweet_text" in dataset["test"].column_names: + dataset = rename_features(dataset) + tokenizer = AutoTokenizer.from_pretrained(multilingual_model) + run_testing(model, dataset, tokenizer, label_map) + else: + tokenizer = AutoTokenizer.from_pretrained(model_name) + run_testing(model, dataset, tokenizer, label_map, model.keys()[i]) # model.keys()[i] to get the model 
name + if __name__ == "__main__": import torch diff --git a/checkthat/task1/test_scripts/load_from_checkpoints.py b/checkthat/task1/test_scripts/load_from_checkpoints.py index eee440f..2c8cb6f 100644 --- a/checkthat/task1/test_scripts/load_from_checkpoints.py +++ b/checkthat/task1/test_scripts/load_from_checkpoints.py @@ -35,3 +35,4 @@ def load_model_from_dir(base_dir): return models + \ No newline at end of file diff --git a/checkthat/task1/test_scripts/run_tests.py b/checkthat/task1/test_scripts/run_tests.py new file mode 100644 index 0000000..d4b101d --- /dev/null +++ b/checkthat/task1/test_scripts/run_tests.py @@ -0,0 +1,34 @@ +import torch +import wandb +from tokenization.tokenizer import TextDataset +from models.custom_model import CustomModel +from metrics.compute_metrics import compute_metrics + +def run_testing(model_name, dataset, tokenizer, label_map, model_named_trained): + """Run testing on the given model and dataset.""" + + run_name = f"TEST__{model_named_trained}" + wandb.init(project="Clef2024", entity="aarnes", name=run_name) + + # Assuming TextDataset provides the input in the correct format + test_dataset = TextDataset(dataset["test"], tokenizer, label_map) + model = CustomModel(model_name=model_name, num_labels=len(label_map), device='cuda') + model.eval() + + logits = [] + labels = [] + + with torch.no_grad(): + for batch in test_dataset: + input_ids, attention_mask, label = batch['input_ids'].to('cuda'), batch['attention_mask'].to('cuda'), batch['labels'].to('cuda') + output = model(input_ids=input_ids, attention_mask=attention_mask) + logits.append(output.logits) # Adjust according to how outputs are structured + labels.append(label) + + logits = torch.cat(logits) + labels = torch.cat(labels) + predictions = logits.argmax(-1) + metrics = compute_metrics((predictions, labels)) + wandb.log(metrics) + wandb.finish() + diff --git a/checkthat/task1/test_scripts/test_devtest_test.py b/checkthat/task1/test_scripts/test_devtest_test.py deleted file mode 100644 index 456abac..0000000 --- a/checkthat/task1/test_scripts/test_devtest_test.py +++ /dev/null @@ -1,15 +0,0 @@ -from load_from_checkpoints import load_model_from_dir - - -base_dir = "./results" -models = load_model_from_dir(base_dir) - - -dataset_list = [ - "iai-group/clef2024_checkthat_task1_en", - "iai-group/clef2024_checkthat_task1_ar", - ] - - - -dataset = load_dataset(dataset_name) diff --git a/checkthat/task1/training_config.yaml b/checkthat/task1/training_config.yaml index a6680ad..e59b0e1 100644 --- a/checkthat/task1/training_config.yaml +++ b/checkthat/task1/training_config.yaml @@ -5,7 +5,7 @@ training_arguments: # eval_steps: 500 # evaluation_strategy: 'steps' # evaluate after some number of steps output_dir: './results' # output directory - save_total_limit: 5 # number of maximum checkpoints to save + save_total_limit: 3 # number of maximum checkpoints to save num_train_epochs: 100 # number of training epochs per_device_train_batch_size: 16 # batch size for training per_device_eval_batch_size: 16 # batch size for evaluation @@ -15,5 +15,4 @@ training_arguments: logging_steps: 10 load_best_model_at_end: True metric_for_best_model: "f1" # metric to use for saving best model - report_to: 'wandb' # report to wandb - + report_to: 'wandb' # report to wandb \ No newline at end of file diff --git a/checkthat/task1/training_scripts/training.py b/checkthat/task1/training_scripts/training.py index 5cfa9d9..cc23944 100644 --- a/checkthat/task1/training_scripts/training.py +++ 
b/checkthat/task1/training_scripts/training.py @@ -25,20 +25,19 @@ def set_seed(seed): if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed) - -def run_training(seed, dataset, model_name, tokenizer, label_map, training_arguments, dataset_language): +def run_training(seed, dataset, model, tokenizer, label_map, training_arguments, dataset_language): """Start training the model for a single seed. Args: seed: seed for reproducibility dataset: dataset dictionary containing train and validation splits - model_name: huggingface model name + model: huggingface model name tokenizer: huggerface tokenizer/same as model name label_map: dictionary mapping labels to integers """ # Initialize wandb run set_seed(seed) - run_name = f"{model_name}_{seed}_{dataset_language}" + run_name = f"{model}_{seed}_{dataset_language}" wandb.init( project="Clef2024", entity="aarnes", @@ -57,11 +56,11 @@ def run_training(seed, dataset, model_name, tokenizer, label_map, training_argum # Creating a Trainer instance with training arguments and datasets trainer = Trainer( - model=CustomModel(model_name, num_labels=len(label_map), device='cuda'), + model=CustomModel(model, num_labels=len(label_map), device='cuda'), args=training_arguments, train_dataset=train_dataset, eval_dataset=eval_dataset, - compute_metrics=compute_metrics, + compute_metrics=compute_metrics(), callbacks=[ EarlyStoppingCallback(early_stopping_patience=3) ], # Early stopping callback From 8aca646eca22d89a6c93ae8b50564c1dbe692e93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20R=C3=B8ysland=20Aarnes?= Date: Thu, 2 May 2024 17:01:26 +0200 Subject: [PATCH 09/16] Starting to create an script to test all models. Next iteration will do it dynamically according to language. --- checkthat/task1/main_test_all.py | 116 ++++++++++++++++++ checkthat/task1/main_train_all.py | 20 +-- .../test_scripts/load_from_checkpoints.py | 38 ------ checkthat/task1/test_scripts/run_tests.py | 99 ++++++++++++--- checkthat/task1/tokenization/tokenizer.py | 24 ++-- checkthat/task1/training_config.yaml | 2 +- 6 files changed, 213 insertions(+), 86 deletions(-) create mode 100644 checkthat/task1/main_test_all.py diff --git a/checkthat/task1/main_test_all.py b/checkthat/task1/main_test_all.py new file mode 100644 index 0000000..efe38ba --- /dev/null +++ b/checkthat/task1/main_test_all.py @@ -0,0 +1,116 @@ +import os +import torch +import pandas as pd +from transformers import AutoModelForSequenceClassification, AutoTokenizer +from task1.models.custom_model import CustomModel +from task1.tokenization.tokenizer import TextDataset +from task1.metrics.compute_metrics import compute_metrics +import wandb +from tokenization.tokenizer import TextDataset + +def find_latest_checkpoint(model_dir): + """Find the latest checkpoint in the given directory.""" + checkpoint_dirs = [os.path.join(model_dir, d) for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d)) and 'checkpoint' in d] + if not checkpoint_dirs: + raise ValueError("No checkpoint directories found in the given model directory.") + latest_checkpoint = sorted(checkpoint_dirs, key=lambda x: int(x.split('-')[-1]))[-1] + return latest_checkpoint + +def find_models_with_checkpoints(base_dir): + """Find models and their latest checkpoints in a structured directory.""" + model_info = [] + for model_name in os.listdir(base_dir): + model_path = os.path.join(base_dir, model_name) + if not os.path.isdir(model_path): + continue + try: + latest_checkpoint = find_latest_checkpoint(model_path) + model_info.append((model_name, 
latest_checkpoint)) + print(f"Model: {model_name}, Latest Checkpoint: {latest_checkpoint}") + except Exception as e: + print(f"Failed to find checkpoint for model {model_name}: {str(e)}") + return model_info + +def run_prediction(model_name, dataset_list, tokenizer, model_path, has_labels: bool): + """Run prediction on the dataset, compute metrics if labels are present, and write results to a .tsv file.""" + device = 'cuda' + label_map = {0: 'no', 1: 'yes'} + + # Detect language from model name and select dataset + lang = model_name.split('_')[-2] # Assumes format like 'modelname_lang_' + dataset = dataset_list[lang] + + # Initialize Weights & Biases + run_name = f"TEST__{model_path}" + wandb.init(project="Clef2024", entity="aarnes", name=run_name) + + # Load the model from the checkpoint + model = CustomModel.from_pretrained(model_path) + model.to(device) + model.eval() + + # Load the dataset, with or without labels + test_dataset = TextDataset(dataset["test"], tokenizer, None if not has_labels else label_map) + all_logits = [] + all_labels = [] + results = [] + + with torch.no_grad(): + for i, batch in enumerate(test_dataset): + input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device) + output = model(input_ids=input_ids, attention_mask=attention_mask) + logits = output.logits + predictions = logits.argmax(-1).cpu().numpy() + + # Collect logits and labels for metric calculation if labels are present + if has_labels and 'labels' in batch: + labels = batch['labels'].cpu().numpy() + all_logits.append(logits) + all_labels.append(torch.tensor(labels)) + for label, pred in zip(labels, predictions): + results.append((i, label_map[pred], model_name)) + else: + for pred in predictions: + results.append((i, label_map[pred], model_name)) + + # If labels were present, calculate metrics + if has_labels: + all_logits = torch.cat(all_logits) + all_labels = torch.cat(all_labels) + predictions = all_logits.argmax(-1) + metrics = compute_metrics((predictions, all_labels)) + wandb.log(metrics) + + # Save results to a .tsv file + df = pd.DataFrame(results, columns=['sentence_id', 'prediction', 'model_name']) + df.to_csv(f"{model_path}_predictions.tsv", sep='\t', index=False) + + # Finish Weights & Biases logging + wandb.finish() + + +if __name__ == "__main__": + # Define the dataset list for each language + dataset_list = { + "en":"iai-group/clef2024_checkthat_task1_en", + "ar":"iai-group/clef2024_checkthat_task1_ar", + "es":"iai-group/clef2024_checkthat_task1_es", + "nl":"iai-group/clef2024_checkthat_task1_nl", + } + + label_map = {"Yes": 1, "No": 0} + + + # Load models and run prediction + base_dir = "./trained_models" + i = 0 + + model_info = find_models_with_checkpoints(base_dir) + for model_name, checkpoint_path in model_info: + tokenizer = AutoTokenizer.from_pretrained(checkpoint_path) # General tokenizer + model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path) # Model for prediction + tokenized_data = TextDataset(dataset_list.values()[i], tokenizer, label_map) + run_prediction(model_name, dataset_list, tokenizer, checkpoint_path, has_labels=True) + i += 1 + + diff --git a/checkthat/task1/main_train_all.py b/checkthat/task1/main_train_all.py index cb40b3f..6303fce 100644 --- a/checkthat/task1/main_train_all.py +++ b/checkthat/task1/main_train_all.py @@ -4,8 +4,8 @@ from training_scripts.training import run_training from transformers import AutoTokenizer from training_scripts.train_config import get_training_arguments -from 
test_scripts.load_from_checkpoints import load_model_from_dir -from test_scripts.run_tests import run_testing +from test_scripts.load_from_checkpoints import find_latest_checkpoint +from test_scripts.run_tests import run_prediction def main(): dataset_list = [ @@ -35,22 +35,6 @@ def main(): run_training(seed, dataset, model_name_en, tokenizer, label_map, training_args) - - """Testing model on testset""" - base_dir = "./results" - models = load_model_from_dir(base_dir) - for model_name, model in models.items(): - i += 1 # Incrementing i to get the model name for each model - for dataset_name in dataset_list: - dataset = load_dataset(dataset_name) - if "tweet_text" in dataset["test"].column_names: - dataset = rename_features(dataset) - tokenizer = AutoTokenizer.from_pretrained(multilingual_model) - run_testing(model, dataset, tokenizer, label_map) - else: - tokenizer = AutoTokenizer.from_pretrained(model_name) - run_testing(model, dataset, tokenizer, label_map, model.keys()[i]) # model.keys()[i] to get the model name - if __name__ == "__main__": import torch diff --git a/checkthat/task1/test_scripts/load_from_checkpoints.py b/checkthat/task1/test_scripts/load_from_checkpoints.py index 2c8cb6f..e69de29 100644 --- a/checkthat/task1/test_scripts/load_from_checkpoints.py +++ b/checkthat/task1/test_scripts/load_from_checkpoints.py @@ -1,38 +0,0 @@ -import os -from transformers import AutoModelForSequenceClassification -import torch - -def find_latest_checkpoint(model_dir): - """Find the latest checkpoint in the given directory.""" - checkpoint_dirs = [os.path.join(model_dir, d) for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d)) and 'checkpoint' in d] - if not checkpoint_dirs: - raise ValueError("No checkpoint directories found in the given model directory.") - - # Sort directories to find the one with the highest step (assuming naming convention includes "checkpoint-") - latest_checkpoint = sorted(checkpoint_dirs, key=lambda x: int(x.split('-')[-1]))[-1] - return latest_checkpoint - -def load_model_from_dir(base_dir): - """Load models from a structured directory of models. - Args: - base_dir (str): Directory containing subdirectories of models named like 'FacebookAI/xlm-roberta-base_10_en' - Returns: - models (dict): Dictionary with keys as model names and values as loaded model objects. 
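The checkpoint helpers here rely on the Hugging Face Trainer's checkpoint-<global_step> directory naming, so the newest save is simply the entry with the largest numeric suffix. A self-contained sketch of that selection step (the directory names are made up for illustration):

# Illustrative sketch only: pick the newest "checkpoint-<step>" directory by its numeric suffix.
def pick_latest_checkpoint(checkpoint_dirs):
    """Return the path whose trailing step number is highest."""
    return max(checkpoint_dirs, key=lambda path: int(path.rsplit("-", 1)[-1]))

example_dirs = [
    "results/roberta-large_seed_42_en/checkpoint-500",   # hypothetical paths
    "results/roberta-large_seed_42_en/checkpoint-1500",
    "results/roberta-large_seed_42_en/checkpoint-1000",
]
print(pick_latest_checkpoint(example_dirs))  # -> .../checkpoint-1500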
- """ - models = {} - for model_name in os.listdir(base_dir): - model_path = os.path.join(base_dir, model_name) - if not os.path.isdir(model_path): - continue - - try: - latest_checkpoint = find_latest_checkpoint(model_path) - model = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint) - models[model_name] = model - print(f"Loaded model from {latest_checkpoint}") - except Exception as e: - print(f"Failed to load model from {model_path}: {str(e)}") - - return models - - \ No newline at end of file diff --git a/checkthat/task1/test_scripts/run_tests.py b/checkthat/task1/test_scripts/run_tests.py index d4b101d..1768771 100644 --- a/checkthat/task1/test_scripts/run_tests.py +++ b/checkthat/task1/test_scripts/run_tests.py @@ -1,34 +1,95 @@ import torch -import wandb -from tokenization.tokenizer import TextDataset +import pandas as pd from models.custom_model import CustomModel +from tokenization.tokenizer import TextDataset +import wandb from metrics.compute_metrics import compute_metrics -def run_testing(model_name, dataset, tokenizer, label_map, model_named_trained): - """Run testing on the given model and dataset.""" - + +import os +from transformers import AutoModelForSequenceClassification +import torch + +def find_latest_checkpoint(model_dir): + """Find the latest checkpoint in the given directory.""" + checkpoint_dirs = [os.path.join(model_dir, d) for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d)) and 'checkpoint' in d] + if not checkpoint_dirs: + raise ValueError("No checkpoint directories found in the given model directory.") + latest_checkpoint = sorted(checkpoint_dirs, key=lambda x: int(x.split('-')[-1]))[-1] + return latest_checkpoint + +def find_models_with_checkpoints(base_dir): + """Find models and their latest checkpoints in a structured directory. + Args: + base_dir (str): Directory containing subdirectories of models + Returns: + model_info (list): List of tuples containing model name and path to the latest checkpoint. 
+ """ + model_info = [] + for model_name in os.listdir(base_dir): + model_path = os.path.join(base_dir, model_name) + if not os.path.isdir(model_path): + continue + + try: + latest_checkpoint = find_latest_checkpoint(model_path) + model_info.append((model_name, latest_checkpoint)) + print(f"Model: {model_name}, Latest Checkpoint: {latest_checkpoint}") + except Exception as e: + print(f"Failed to find checkpoint for model {model_name}: {str(e)}") + + return model_info + + + +def run_prediction(model_name, dataset, tokenizer, model_named_trained, has_labels: bool): + """Run prediction on the dataset, compute metrics if labels are present, and write results to a .tsv file.""" + + device = 'cuda' + label_map = {0: 'no', 1: 'yes'} # Ensure this mapping is correct for your model + + # Initialize Weights & Biases run_name = f"TEST__{model_named_trained}" wandb.init(project="Clef2024", entity="aarnes", name=run_name) - # Assuming TextDataset provides the input in the correct format - test_dataset = TextDataset(dataset["test"], tokenizer, label_map) - model = CustomModel(model_name=model_name, num_labels=len(label_map), device='cuda') + model = CustomModel(model_name=model_name, num_labels=len(label_map), device=device) model.eval() - logits = [] - labels = [] + # Load the dataset, with or without labels + test_dataset = TextDataset(dataset["test"], tokenizer, None if not has_labels else label_map) + all_logits = [] + all_labels = [] + results = [] with torch.no_grad(): - for batch in test_dataset: - input_ids, attention_mask, label = batch['input_ids'].to('cuda'), batch['attention_mask'].to('cuda'), batch['labels'].to('cuda') + for i, batch in enumerate(test_dataset): + input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device) output = model(input_ids=input_ids, attention_mask=attention_mask) - logits.append(output.logits) # Adjust according to how outputs are structured - labels.append(label) + logits = output.logits + predictions = logits.argmax(-1).cpu().numpy() + + # Collect logits and labels for metric calculation if labels are present + if has_labels and 'labels' in batch: + labels = batch['labels'].cpu().numpy() + all_logits.append(logits) + all_labels.append(torch.tensor(labels)) + for label, pred in zip(labels, predictions): + results.append((i, label_map[pred], model_named_trained)) + else: + for pred in predictions: + results.append((i, label_map[pred], model_named_trained)) - logits = torch.cat(logits) - labels = torch.cat(labels) - predictions = logits.argmax(-1) - metrics = compute_metrics((predictions, labels)) + # If labels were present, calculate metrics + if has_labels: + all_logits = torch.cat(all_logits) + all_labels = torch.cat(all_labels) + predictions = all_logits.argmax(-1) + metrics = compute_metrics((predictions, all_labels)) wandb.log(metrics) - wandb.finish() + # Save results to a .tsv file + df = pd.DataFrame(results, columns=['sentence_id', 'prediction', 'model_name']) + df.to_csv(f"{model_named_trained}_predictions.tsv", sep='\t', index=False) + + # Finish Weights & Biases logging + wandb.finish() diff --git a/checkthat/task1/tokenization/tokenizer.py b/checkthat/task1/tokenization/tokenizer.py index b4e5848..27c9303 100644 --- a/checkthat/task1/tokenization/tokenizer.py +++ b/checkthat/task1/tokenization/tokenizer.py @@ -1,16 +1,16 @@ -"""Tokenizer for the task1 datasets.""" import torch from torch.utils.data import Dataset - class TextDataset(Dataset): - """Takes a list of dictionaries containing text and class labels. 
+ """Takes a list of dictionaries containing text and optionally class labels. Args: - Dataset: Dataset class from torch.utils.data + data (list): A list of dictionaries with keys 'Text' and optionally 'class_label'. + tokenizer: Tokenizer instance for text processing. + label_map (dict, optional): A dictionary mapping class labels to integers. None if unlabeled. """ - def __init__(self, data, tokenizer, label_map): + def __init__(self, data, tokenizer, label_map=None): """Initialize the TextDataset class.""" self.data = data self.tokenizer = tokenizer @@ -21,8 +21,8 @@ def __len__(self): return len(self.data) def __getitem__(self, idx): - """Tokenize the text and return a dictionary containing the - tokenized.""" + """Tokenize the text and return a dictionary containing the tokenized data. + If labels are present, include them, otherwise only return inputs.""" item = self.data[idx] encoded = self.tokenizer.encode_plus( item["Text"], @@ -33,9 +33,13 @@ def __getitem__(self, idx): return_tensors="pt", ) - label_id = self.label_map[item["class_label"]] - return { + result = { "input_ids": encoded["input_ids"].squeeze(0), "attention_mask": encoded["attention_mask"].squeeze(0), - "labels": torch.tensor(label_id), } + + if 'class_label' in item and self.label_map is not None: + label_id = self.label_map[item["class_label"]] + result["labels"] = torch.tensor(label_id) + + return result diff --git a/checkthat/task1/training_config.yaml b/checkthat/task1/training_config.yaml index e59b0e1..4ec4bfc 100644 --- a/checkthat/task1/training_config.yaml +++ b/checkthat/task1/training_config.yaml @@ -4,7 +4,7 @@ training_arguments: greater_is_better: True # eval_steps: 500 # evaluation_strategy: 'steps' # evaluate after some number of steps - output_dir: './results' # output directory + output_dir: './trained_models' # output directory save_total_limit: 3 # number of maximum checkpoints to save num_train_epochs: 100 # number of training epochs per_device_train_batch_size: 16 # batch size for training From ef31efb9d223365f24401c72d17ff347f2f8b087 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20R=C3=B8ysland=20Aarnes?= Date: Thu, 2 May 2024 17:04:56 +0200 Subject: [PATCH 10/16] Paths fix --- checkthat/task1/main_test_all.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/checkthat/task1/main_test_all.py b/checkthat/task1/main_test_all.py index efe38ba..cd4615b 100644 --- a/checkthat/task1/main_test_all.py +++ b/checkthat/task1/main_test_all.py @@ -2,9 +2,9 @@ import torch import pandas as pd from transformers import AutoModelForSequenceClassification, AutoTokenizer -from task1.models.custom_model import CustomModel -from task1.tokenization.tokenizer import TextDataset -from task1.metrics.compute_metrics import compute_metrics +from models.custom_model import CustomModel +from tokenization.tokenizer import TextDataset +from metrics.compute_metrics import compute_metrics import wandb from tokenization.tokenizer import TextDataset @@ -112,5 +112,5 @@ def run_prediction(model_name, dataset_list, tokenizer, model_path, has_labels: tokenized_data = TextDataset(dataset_list.values()[i], tokenizer, label_map) run_prediction(model_name, dataset_list, tokenizer, checkpoint_path, has_labels=True) i += 1 - + From 448b27793371d159982af247cd931043b0499624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20R=C3=B8ysland=20Aarnes?= Date: Thu, 2 May 2024 17:20:33 +0200 Subject: [PATCH 11/16] Added slurm --- checkthat/task1/main_train_all.py | 2 -- checkthat/task1/slurm/conda_setup.sh | 14 
++++++++++++++ checkthat/task1/slurm/start_train_all.sh | 15 +++++++++++++++ checkthat/task1/training_config.yaml | 2 +- 4 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 checkthat/task1/slurm/conda_setup.sh create mode 100644 checkthat/task1/slurm/start_train_all.sh diff --git a/checkthat/task1/main_train_all.py b/checkthat/task1/main_train_all.py index 6303fce..f9a398d 100644 --- a/checkthat/task1/main_train_all.py +++ b/checkthat/task1/main_train_all.py @@ -4,8 +4,6 @@ from training_scripts.training import run_training from transformers import AutoTokenizer from training_scripts.train_config import get_training_arguments -from test_scripts.load_from_checkpoints import find_latest_checkpoint -from test_scripts.run_tests import run_prediction def main(): dataset_list = [ diff --git a/checkthat/task1/slurm/conda_setup.sh b/checkthat/task1/slurm/conda_setup.sh new file mode 100644 index 0000000..92721eb --- /dev/null +++ b/checkthat/task1/slurm/conda_setup.sh @@ -0,0 +1,14 @@ +#!/bin/bash +#SBATCH --gres=gpu:0 +#SBATCH --partition=gpuA100 +#SBATCH --time=1:00:00 +#SBATCH --job-name=conda_setup +#SBATCH --output=conda_setup.out + +uenv verbose cuda-12.2.0 cudnn-12.x-8.8.0 +uenv miniconda3-py39 +conda create -n transformer_cuda12 -c pytorch pytorch torchvision torchaudio pytorch-cuda=12.1 -c nvidia -y +conda activate transformer_cuda12 +pip3 install torch torchvision torchaudio +pip3 install transformers[torch] +pip3 install -r requirements.txt \ No newline at end of file diff --git a/checkthat/task1/slurm/start_train_all.sh b/checkthat/task1/slurm/start_train_all.sh new file mode 100644 index 0000000..3781a0d --- /dev/null +++ b/checkthat/task1/slurm/start_train_all.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --gres=gpu:6 +#SBATCH --partition=gpuA100 +#SBATCH --time=1:00:00 +#SBATCH --job-name=CLEF2024_task1_training +#SBATCH --output=start_train_all.out + +# Activate environment +uenv verbose cuda-12.2.0 cudnn-12.x-8.8.0 +uenv miniconda3-py39 +conda activate transformer_cuda12 +PATH=~/.local/bin:$PATH +echo $PATH +# Run the Python script that uses the GPU +TOKENIZERS_PARALLELISM=false python -u main_train_all.py \ No newline at end of file diff --git a/checkthat/task1/training_config.yaml b/checkthat/task1/training_config.yaml index 4ec4bfc..955fc59 100644 --- a/checkthat/task1/training_config.yaml +++ b/checkthat/task1/training_config.yaml @@ -6,7 +6,7 @@ training_arguments: # evaluation_strategy: 'steps' # evaluate after some number of steps output_dir: './trained_models' # output directory save_total_limit: 3 # number of maximum checkpoints to save - num_train_epochs: 100 # number of training epochs + num_train_epochs: 1 # number of training epochs per_device_train_batch_size: 16 # batch size for training per_device_eval_batch_size: 16 # batch size for evaluation warmup_steps: 500 # number of warmup steps for learning rate scheduler From ec3da9e76bc427dd5e3259393e033f455f4292bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20R=C3=B8ysland=20Aarnes?= Date: Fri, 3 May 2024 11:22:27 +0200 Subject: [PATCH 12/16] Moving thing around and simplifying requirements.txt --- checkthat/task1/{slurm => }/conda_setup.sh | 0 checkthat/task1/requirements.txt | Bin 0 -> 416 bytes checkthat/task1/{slurm => }/start_train_all.sh | 0 requirements.txt | Bin 3520 -> 0 bytes 4 files changed, 0 insertions(+), 0 deletions(-) rename checkthat/task1/{slurm => }/conda_setup.sh (100%) create mode 100644 checkthat/task1/requirements.txt rename checkthat/task1/{slurm => }/start_train_all.sh (100%) 
delete mode 100644 requirements.txt diff --git a/checkthat/task1/slurm/conda_setup.sh b/checkthat/task1/conda_setup.sh similarity index 100% rename from checkthat/task1/slurm/conda_setup.sh rename to checkthat/task1/conda_setup.sh diff --git a/checkthat/task1/requirements.txt b/checkthat/task1/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..54eeae829d4928f2fcc2b3e8983e5d72456cf401 GIT binary patch literal 416 zcmY*V%L>9U5S+8%rzmc8eTKEn#CDf_T_wg)*Ttf?8m;I@rrm$tVS{RTW{<$#TTb}?0FKb zvDJ;rp7jnFa(XH^oNrqWnq`a_U2)U2U*>S9enH+p>%m%3y%~{7ciP`J4>-+~&te98 zDEK?(V5NN>nNcOvPL6$^Xb}IYeKVx;wUH-tmk2%JNY{=(QYoLu9uFcF9hg%u{0p4i BN0a~n literal 0 HcmV?d00001 diff --git a/checkthat/task1/slurm/start_train_all.sh b/checkthat/task1/start_train_all.sh similarity index 100% rename from checkthat/task1/slurm/start_train_all.sh rename to checkthat/task1/start_train_all.sh diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 5d2c97de202f66d51c1e76a6f556f6e2f28e23e0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3520 zcmZ{nOOF#t5QOWDw7&!y;r1{LhdnHpl~y1japJV{8-pK>+lHC>@$ANz(dlZ>va*JG zWL9QmWMo$H&(CR@>$5CdeI3iRob>&7eO{Iqr73Up@1*=zp2@N)hw@qWQ)$CvA{+Ks z1A8a0b=k{%8$RPW|Due0pH^oV@;R37V!x5kO>ghYgHF%ONLe;zCoG3@*3Tfq4^)W< zBpIjCU%`t9XlFV(jgy9`Qj|qD&(Wr>O@_x8bpn<)km?bgzg%j*c4_c$Z?7UJ+?8 zZ&*A<N#*%<=5UX%e^8{CyG3%%C+KIOPSrwkR1guNwAyTQ{_1H=_y#r zIt_$(f#|jDuy0mZ5uL8?%_D{t=S(%$TIWxy+>E+vA~?pMVf`S6 zPEsJ1k;gI6f+yp%xLND7iON?(4jUt#IQMbM*M$14^i3#vAAGG8ov1KGk0!mo@L@}2 zM(cx#Z-s{@V$`Tvj#@R&pCB8K{eih3U)y)9(3BvA4k&6EfFN z_}qrQ#!sI%6har!K+Q$QanJ8gyrcchH6AD|Z6igvS1y#s-iE)rl`D69d7R3R=*w4~ zAST$W>e)!%2K#=)!3#Ub_3c$33{*T#ROha@;)Pk6UQ|Yg+~}}}$37HLHxb#bH>~hF zoCkAJ5w@z2&2vy4<^+%0fmSx6&oMX8rWGW1|xxxttZcaZwM>xl(kI>-)w zP{yIc>7^$PPw&@jKQdo?)g9-KrCyBfgW`NtM!H@V2R}6rx+Dwi_vNm)_OrHop{`cNwhYK-k}U{bGq`>6IMI}@LgwsyPmCm zlzpVA3&l*oo7moo3rx9%P$=8liXk5nT_sz^ELa`p$(WgrsiKg<^ON20gaTF1WM^jS z$ug!XZ+y>w&Ro9i$i{tMwd>8yoAP@5@bUu%+MpBME_9z)5!=@jM;%p5Oe?Pn?i#E0QYgVJ!5Qx1JT}{o0i0`rBDi8hu^+@xEnG9iE)H_WCZ5 z-0nnnMrJKBeN&WO#Gu;SL8t4xF83lGaNYRcL^P{{y}^5=lXm`pKHND^(@Uu&-y%Nk zH^$$J^6PtvoPJMc&PQQqZ~G-t-`(cw9D2U)Tje12c2-U3jN5;5;S$9<3k>oi;f;vS z^Nu4MEPDIF`VV< Date: Fri, 3 May 2024 19:45:21 +0200 Subject: [PATCH 13/16] Tweaks so model and tokenizer get's saved --- checkthat/task1/conda_setup.sh | 2 +- checkthat/task1/requirements.txt | Bin 416 -> 404 bytes checkthat/task1/test_load.py | 17 ++++ checkthat/task1/test_scripts/__init__.py | 0 .../test_scripts/load_from_checkpoints.py | 0 checkthat/task1/test_scripts/run_tests.py | 95 ------------------ checkthat/task1/training_scripts/training.py | 4 + 7 files changed, 22 insertions(+), 96 deletions(-) create mode 100644 checkthat/task1/test_load.py delete mode 100644 checkthat/task1/test_scripts/__init__.py delete mode 100644 checkthat/task1/test_scripts/load_from_checkpoints.py delete mode 100644 checkthat/task1/test_scripts/run_tests.py diff --git a/checkthat/task1/conda_setup.sh b/checkthat/task1/conda_setup.sh index 92721eb..a8e1890 100644 --- a/checkthat/task1/conda_setup.sh +++ b/checkthat/task1/conda_setup.sh @@ -8,7 +8,7 @@ uenv verbose cuda-12.2.0 cudnn-12.x-8.8.0 uenv miniconda3-py39 conda create -n transformer_cuda12 -c pytorch pytorch torchvision torchaudio pytorch-cuda=12.1 -c nvidia -y -conda activate transformer_cuda12 +conda activate transformer_cudlsa12 pip3 install torch torchvision torchaudio pip3 install transformers[torch] pip3 install -r requirements.txt \ No newline at end of file diff --git 
a/checkthat/task1/requirements.txt b/checkthat/task1/requirements.txt index 54eeae829d4928f2fcc2b3e8983e5d72456cf401..73315d450939b1db79a5e93494b33502eba93f4c 100644 GIT binary patch delta 11 ScmZ3$JcW5f1LNc#MiBrQFaw|f delta 11 ScmbQjynuN_1LNcpMg;&D+yj&V diff --git a/checkthat/task1/test_load.py b/checkthat/task1/test_load.py new file mode 100644 index 0000000..739dc20 --- /dev/null +++ b/checkthat/task1/test_load.py @@ -0,0 +1,17 @@ +import torch +from models.custom_model import CustomModel +from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer + +path = "checkthat/task1/test_model_xlm_roberta/model.safetensors" + +print("test") +print(path) + +model = CustomModel("xlm-roberta-base", 2, "cpu") +model.load_state_dict(torch.load(path, map_location="cpu")) + + +# config = AutoConfig.from_pretrained(path) +# tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base") +# model = AutoModelForSequenceClassification.from_pretrained(path, config=config) + diff --git a/checkthat/task1/test_scripts/__init__.py b/checkthat/task1/test_scripts/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/checkthat/task1/test_scripts/load_from_checkpoints.py b/checkthat/task1/test_scripts/load_from_checkpoints.py deleted file mode 100644 index e69de29..0000000 diff --git a/checkthat/task1/test_scripts/run_tests.py b/checkthat/task1/test_scripts/run_tests.py deleted file mode 100644 index 1768771..0000000 --- a/checkthat/task1/test_scripts/run_tests.py +++ /dev/null @@ -1,95 +0,0 @@ -import torch -import pandas as pd -from models.custom_model import CustomModel -from tokenization.tokenizer import TextDataset -import wandb -from metrics.compute_metrics import compute_metrics - - -import os -from transformers import AutoModelForSequenceClassification -import torch - -def find_latest_checkpoint(model_dir): - """Find the latest checkpoint in the given directory.""" - checkpoint_dirs = [os.path.join(model_dir, d) for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d)) and 'checkpoint' in d] - if not checkpoint_dirs: - raise ValueError("No checkpoint directories found in the given model directory.") - latest_checkpoint = sorted(checkpoint_dirs, key=lambda x: int(x.split('-')[-1]))[-1] - return latest_checkpoint - -def find_models_with_checkpoints(base_dir): - """Find models and their latest checkpoints in a structured directory. - Args: - base_dir (str): Directory containing subdirectories of models - Returns: - model_info (list): List of tuples containing model name and path to the latest checkpoint. 
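The training change in this patch saves the fine-tuned weights and tokenizer with save_pretrained() so they can be reloaded without digging through Trainer checkpoints. A minimal round-trip sketch; the base checkpoint and output directory are placeholders:

# Illustrative sketch: save_pretrained()/from_pretrained() round trip.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

save_dir = "./trained_models/example_run"  # hypothetical output directory
model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

model.save_pretrained(save_dir)      # writes config.json and the weights (safetensors)
tokenizer.save_pretrained(save_dir)  # writes the tokenizer files next to the model

reloaded_model = AutoModelForSequenceClassification.from_pretrained(save_dir)
reloaded_tokenizer = AutoTokenizer.from_pretrained(save_dir)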
- """ - model_info = [] - for model_name in os.listdir(base_dir): - model_path = os.path.join(base_dir, model_name) - if not os.path.isdir(model_path): - continue - - try: - latest_checkpoint = find_latest_checkpoint(model_path) - model_info.append((model_name, latest_checkpoint)) - print(f"Model: {model_name}, Latest Checkpoint: {latest_checkpoint}") - except Exception as e: - print(f"Failed to find checkpoint for model {model_name}: {str(e)}") - - return model_info - - - -def run_prediction(model_name, dataset, tokenizer, model_named_trained, has_labels: bool): - """Run prediction on the dataset, compute metrics if labels are present, and write results to a .tsv file.""" - - device = 'cuda' - label_map = {0: 'no', 1: 'yes'} # Ensure this mapping is correct for your model - - # Initialize Weights & Biases - run_name = f"TEST__{model_named_trained}" - wandb.init(project="Clef2024", entity="aarnes", name=run_name) - - model = CustomModel(model_name=model_name, num_labels=len(label_map), device=device) - model.eval() - - # Load the dataset, with or without labels - test_dataset = TextDataset(dataset["test"], tokenizer, None if not has_labels else label_map) - all_logits = [] - all_labels = [] - results = [] - - with torch.no_grad(): - for i, batch in enumerate(test_dataset): - input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device) - output = model(input_ids=input_ids, attention_mask=attention_mask) - logits = output.logits - predictions = logits.argmax(-1).cpu().numpy() - - # Collect logits and labels for metric calculation if labels are present - if has_labels and 'labels' in batch: - labels = batch['labels'].cpu().numpy() - all_logits.append(logits) - all_labels.append(torch.tensor(labels)) - for label, pred in zip(labels, predictions): - results.append((i, label_map[pred], model_named_trained)) - else: - for pred in predictions: - results.append((i, label_map[pred], model_named_trained)) - - # If labels were present, calculate metrics - if has_labels: - all_logits = torch.cat(all_logits) - all_labels = torch.cat(all_labels) - predictions = all_logits.argmax(-1) - metrics = compute_metrics((predictions, all_labels)) - wandb.log(metrics) - - # Save results to a .tsv file - df = pd.DataFrame(results, columns=['sentence_id', 'prediction', 'model_name']) - df.to_csv(f"{model_named_trained}_predictions.tsv", sep='\t', index=False) - - # Finish Weights & Biases logging - wandb.finish() diff --git a/checkthat/task1/training_scripts/training.py b/checkthat/task1/training_scripts/training.py index cc23944..b2f3833 100644 --- a/checkthat/task1/training_scripts/training.py +++ b/checkthat/task1/training_scripts/training.py @@ -68,6 +68,10 @@ def run_training(seed, dataset, model, tokenizer, label_map, training_arguments, # Train the model trainer.train() + model.save_pretrained("./trained_models") + + # Save the tokenizer + tokenizer.save_pretrained("./trained_models") # Finish the wandb run after each seed wandb.finish() From 06c4ac3c1d265fdbbb007d17e515dfedde6a72bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20R=C3=B8ysland=20Aarnes?= Date: Sun, 5 May 2024 21:05:34 +0200 Subject: [PATCH 14/16] Implemented sweep functionality for training --- checkthat/task1/auth.sh | 37 +++++++ checkthat/task1/conda_setup.sh | 19 ++-- .../{ => depricated_scripts}/main_test_all.py | 0 .../main_train_all.py | 0 checkthat/task1/main.py | 28 +++--- checkthat/task1/requirements.txt | Bin 404 -> 434 bytes checkthat/task1/start_train.sh | 29 ++++++ checkthat/task1/sweep.yaml | 29 
++++++ checkthat/task1/test_load.py | 17 ---- checkthat/task1/training_scripts/training.py | 95 ++++++++++-------- 10 files changed, 177 insertions(+), 77 deletions(-) create mode 100644 checkthat/task1/auth.sh rename checkthat/task1/{ => depricated_scripts}/main_test_all.py (100%) rename checkthat/task1/{ => depricated_scripts}/main_train_all.py (100%) create mode 100644 checkthat/task1/start_train.sh create mode 100644 checkthat/task1/sweep.yaml delete mode 100644 checkthat/task1/test_load.py diff --git a/checkthat/task1/auth.sh b/checkthat/task1/auth.sh new file mode 100644 index 0000000..cbbf433 --- /dev/null +++ b/checkthat/task1/auth.sh @@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --gres=gpu:0 +#SBATCH --partition=gpuA100 +#SBATCH --time=1:00:00 +#SBATCH --job-name=setup_authenticator.sh +#SBATCH --output=hf_test.out +# Load necessary modules, if required +# module load python/3.9 # Adjust this according to your environment + +# Activate your Python environment +# source ~/bhome/env/checkthat2024_env/bin/activate + +# Explicitly specify the path to the correct Python executable +# PYTHON="~/bhome/env/checkthat2024_env/bin/" +PYTHON="~/.conda/envs/CLEF_checkthat2024/bin" +uenv miniconda3-py39 + +# Activate the Conda environment +#conda activate ~/bhome/env/checkthat2024_env +conda activate ~/.conda/envs/CLEF_checkthat2024 + +export HF_HOME=~/bhome/clef2024-checkthat/checkthat/task1 +# Create necessary directories +mkdir -p $HF_HOME $WANDB_CACHE_DIR + +# Store the Hugging Face token +echo 'KEY' > $HF_HOME/token +chmod 600 $HF_HOME/token + +# Log in to wandb +export WANDB_API_KEY='KEY' +export WANDB_CACHE_DIR=~/bhome/clef2024-checkthat/checkthat/task1 +wandb login KEY + +# Test the Hugging Face API with a Python script +#$PYTHON test_start.py +python -u test_hf_login.py diff --git a/checkthat/task1/conda_setup.sh b/checkthat/task1/conda_setup.sh index a8e1890..6305e07 100644 --- a/checkthat/task1/conda_setup.sh +++ b/checkthat/task1/conda_setup.sh @@ -1,14 +1,21 @@ #!/bin/bash #SBATCH --gres=gpu:0 -#SBATCH --partition=gpuA100 +#SBATCH --partition=gpuA100 #SBATCH --time=1:00:00 #SBATCH --job-name=conda_setup #SBATCH --output=conda_setup.out +module load cuda/12.2.0 cudnn/8.8.0 # Load CUDA and cuDNN modules + +# Activate the user environment (uenv) uenv verbose cuda-12.2.0 cudnn-12.x-8.8.0 uenv miniconda3-py39 -conda create -n transformer_cuda12 -c pytorch pytorch torchvision torchaudio pytorch-cuda=12.1 -c nvidia -y -conda activate transformer_cudlsa12 -pip3 install torch torchvision torchaudio -pip3 install transformers[torch] -pip3 install -r requirements.txt \ No newline at end of file + +# Create and activate the Conda environment +conda create -n CLEF_checkthat2024 -c pytorch pytorch torchvision torchaudio pytorch-cuda=12.1 -c nvidia -y +conda activate CLEF_checkthat2024 + +# Install Python packages +pip install torch torchvision torchaudio +pip install transformers[torch] +pip install -r requirements.txt \ No newline at end of file diff --git a/checkthat/task1/main_test_all.py b/checkthat/task1/depricated_scripts/main_test_all.py similarity index 100% rename from checkthat/task1/main_test_all.py rename to checkthat/task1/depricated_scripts/main_test_all.py diff --git a/checkthat/task1/main_train_all.py b/checkthat/task1/depricated_scripts/main_train_all.py similarity index 100% rename from checkthat/task1/main_train_all.py rename to checkthat/task1/depricated_scripts/main_train_all.py diff --git a/checkthat/task1/main.py b/checkthat/task1/main.py index cc7ff58..738f2fe 100644 --- 
a/checkthat/task1/main.py +++ b/checkthat/task1/main.py @@ -1,4 +1,4 @@ -"""Will run script to run training and testing. (test yet to be implemented) +"""Will run script to run training and testing. (unlabeled tests yet to ble implemented) Argument parser is used to specify the model name and dataset name. """ @@ -6,31 +6,33 @@ from datasets import load_dataset from training_scripts.training import run_training from transformers import AutoTokenizer +from tokenization.tokenizer import TextDataset def main(args): """Run training.""" + label_map = {"Yes": 1, "No": 0} + tokenizer = AutoTokenizer.from_pretrained(args.model_name) dataset = load_dataset(args.dataset) - label_map = {"No": 0, "Yes": 1} # Label map for the dataset - seeds = [42, 81, 1024, 6, 10] # Seeds for reproducibility - if args.train: - for seed in seeds: - run_training(seed, dataset, args.model_name, tokenizer, label_map) + dataset_language = args.dataset.split("_")[-2:] + + train_dataset = TextDataset(dataset["train"], tokenizer, label_map) + eval_dataset = TextDataset(dataset["validation"], tokenizer, label_map) + test_dataset = TextDataset(dataset["test"], tokenizer, label_map) + + run_training(train_dataset, eval_dataset, args.model_name, label_map, dataset_language, test_dataset) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Run training and testing.") - parser.add_argument( - "--train", action="store_true", help="Whether to run training" - ) - parser.add_argument( - "--test", action="store_true", help="Whether to run testing" - ) + # parser.add_argument( + # "--test", action="store_true", help="Whether to run testing" + # ) parser.add_argument( "--model_name", type=str, @@ -41,7 +43,7 @@ def main(args): "--dataset", type=str, default="iai-group/clef2024_checkthat_task1_en", # For English language - help="Name of the dataset", + help="Name of the dataset from the iai-group/clef2024_checkthat_task1_* datasets", ) args = parser.parse_args() diff --git a/checkthat/task1/requirements.txt b/checkthat/task1/requirements.txt index 73315d450939b1db79a5e93494b33502eba93f4c..34938acef0ef25d67c97c989227235f80a7bfaa8 100644 GIT binary patch delta 38 ocmbQjyoq_k6h^rWhEj%fAk1XQ1CnVBi44gMsSLUdAn_yy0KVP`lmGw# delta 7 OcmdnQJcW6~6h;6GAOeyA diff --git a/checkthat/task1/start_train.sh b/checkthat/task1/start_train.sh new file mode 100644 index 0000000..f2dd707 --- /dev/null +++ b/checkthat/task1/start_train.sh @@ -0,0 +1,29 @@ +#!/bin/bash +#SBATCH --gres=gpu:1 +#SBATCH --partition=gpuA100 +#SBATCH --time=1:00:00 +#SBATCH --job-name=CLEF2024_task1_training +#SBATCH --output=start_train_all.out + +# Load CUDA and cuDNN modules +module load cuda/12.2.0 cudnn/8.8.0 + +# Activate the user environment (uenv) +uenv verbose cuda-12.2.0 cudnn-12.x-8.8.0 +uenv miniconda3-py39 + +# Activate the Conda environment +#conda activate ~/bhome/env/checkthat2024_env + +conda ~/.conda/envs/CLEF_checkthat2024 + + +# Add user's local bin directory to the PATH +PATH=~/.local/bin:$PATH +echo $PATH + +# Disable tokenizers parallelism for better GPU utilization +export TOKENIZERS_PARALLELISM=false + +# Run the Python script that uses the GPU +python -u main_train_all.py \ No newline at end of file diff --git a/checkthat/task1/sweep.yaml b/checkthat/task1/sweep.yaml new file mode 100644 index 0000000..f4a5e5d --- /dev/null +++ b/checkthat/task1/sweep.yaml @@ -0,0 +1,29 @@ +# Porgram to run +program: main.py + +# Sweep method can be grid, random, bayesian +method: random + +# Project for sweep +project: testsweep +entity: iai-group + +# 
Metrics to optimize +metric: + name: f1, + goal: maximize + + +parameters: + metric_for_best_model: + values: ["f1"] + hidden_dropout_prob: + values: [0.1, 0.2, 0.3] # Define discrete steps for grid search + epochs: + values: [10, 20, 50] # Convert range to discrete values + batch_size: + values: [16, 32] + learning_rate: + values: [0.000025, 0.00005, 0.000075, 0.0001] # Define steps for learning rate +seed: + values: [42, 123, 2023, 1002, 95] \ No newline at end of file diff --git a/checkthat/task1/test_load.py b/checkthat/task1/test_load.py deleted file mode 100644 index 739dc20..0000000 --- a/checkthat/task1/test_load.py +++ /dev/null @@ -1,17 +0,0 @@ -import torch -from models.custom_model import CustomModel -from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer - -path = "checkthat/task1/test_model_xlm_roberta/model.safetensors" - -print("test") -print(path) - -model = CustomModel("xlm-roberta-base", 2, "cpu") -model.load_state_dict(torch.load(path, map_location="cpu")) - - -# config = AutoConfig.from_pretrained(path) -# tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base") -# model = AutoModelForSequenceClassification.from_pretrained(path, config=config) - diff --git a/checkthat/task1/training_scripts/training.py b/checkthat/task1/training_scripts/training.py index b2f3833..0f1e10b 100644 --- a/checkthat/task1/training_scripts/training.py +++ b/checkthat/task1/training_scripts/training.py @@ -9,69 +9,82 @@ from metrics.compute_metrics import compute_metrics from training_scripts.train_config import get_training_arguments from training_scripts.train_config import get_language -import random import numpy as np import torch import torch.cuda import torch torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True +from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments -def set_seed(seed): - """Set seed for reproducibility.""" - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) -def run_training(seed, dataset, model, tokenizer, label_map, training_arguments, dataset_language): - """Start training the model for a single seed. - Args: - seed: seed for reproducibility - dataset: dataset dictionary containing train and validation splits - model: huggingface model name - tokenizer: huggerface tokenizer/same as model name - label_map: dictionary mapping labels to integers - """ - # Initialize wandb run - set_seed(seed) - run_name = f"{model}_{seed}_{dataset_language}" - wandb.init( - project="Clef2024", +def run_training(train_dataset, eval_dataset, model_name, label_map, dataset_language, test_dataset=None,): + """Run training sweep. 
Evaluate on validation set and test set.""" + + run_name = wandb.init( + project="sweep_test", entity="aarnes", - name=run_name, - config={"seed": seed}, - ) + reinit=True + ).name - # Prepare datasets - train_dataset = TextDataset(dataset["train"], tokenizer, label_map) - eval_dataset = TextDataset(dataset["validation"], tokenizer, label_map) + # Load model and tokenizer from Hugging Face + hf_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_map)) + hf_tokenizer = AutoTokenizer.from_pretrained(model_name) - # training_arguments = get_training_arguments() - training_arguments.run_name = ( - run_name # Optional, sync the name with Trainer's internal wandb run + + # Define training arguments + training_arguments = TrainingArguments( + output_dir="./results", # Directory to save model and tokenizer + evaluation_strategy="epoch", + learning_rate=wandb.config.learning_rate, + per_device_train_batch_size=wandb.config.batch_size, + num_train_epochs=wandb.config.epochs, + logging_dir='./logs', + logging_steps=10, + do_train=True, + do_eval=True, + load_best_model_at_end=True, + save_strategy="epoch", # Save model at the end of each epoch + save_total_limit=1, # Optional: limits the total amount of checkpoints, deleting older + report_to="wandb", + run_name=run_name, ) - # Creating a Trainer instance with training arguments and datasets + # Create a Trainer instance trainer = Trainer( - model=CustomModel(model, num_labels=len(label_map), device='cuda'), + model=hf_model, args=training_arguments, train_dataset=train_dataset, eval_dataset=eval_dataset, - compute_metrics=compute_metrics(), - callbacks=[ - EarlyStoppingCallback(early_stopping_patience=3) - ], # Early stopping callback + compute_metrics=compute_metrics, + callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] ) # Train the model trainer.train() - model.save_pretrained("./trained_models") + # Evaluate the model on the test dataset + test_output = trainer.predict(test_dataset) + test_results = {f"test_{k}": v for k, v in test_output.metrics.items()} + + + + # Evaluate the model + eval_results = trainer.evaluate() + + # Log evaluation and test results to W&B + wandb.log({"eval_results": eval_results}) + wandb.log({"test_results": test_results}) - # Save the tokenizer - tokenizer.save_pretrained("./trained_models") + # Save model and tokenizer at the end of training + model_path = f"{training_arguments.output_dir}/{run_name}_model_{dataset_language}" + tokenizer_path = f"{training_arguments.output_dir}/{run_name}_tokenizer_{dataset_language}" - # Finish the wandb run after each seed + hf_model.save_pretrained(model_path) + hf_tokenizer.save_pretrained(tokenizer_path) + + # Ensure the W&B run is finished wandb.finish() + + # Return paths for model and tokenizer for user reference + return model_path, tokenizer_path \ No newline at end of file From 9476daeba1f4084ca5676f290f26ec8c3de2cc26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20R=C3=B8ysland=20Aarnes?= Date: Mon, 6 May 2024 10:55:05 +0200 Subject: [PATCH 15/16] Restructuring folders (simplified compute metrics). Minor alterations to training scripts. 
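This patch also switches the averaging mode once more: the standalone metrics logger goes back to 'binary' while the Trainer-side compute_metrics uses 'macro'. A short sketch of the difference, with made-up labels, since the two can diverge noticeably on imbalanced check-worthiness data:

# Illustrative sketch: "binary" scores only the positive class, "macro" averages per-class scores.
from sklearn.metrics import f1_score

labels      = [1, 0, 0, 0, 1, 0]   # made-up gold labels
predictions = [1, 0, 0, 1, 0, 0]   # made-up model output

print(f1_score(labels, predictions, average="binary", pos_label=1))  # F1 of the positive class only
print(f1_score(labels, predictions, average="macro"))                # mean of per-class F1 scores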
--- .../metrics/__init__.py | 0 .../metrics/compute_metrics.py | 0 .../metrics/metrics_logger.py | 6 +-- checkthat/task1/main.py | 2 +- checkthat/task1/start_train.sh | 50 ++++++++++++++++--- checkthat/task1/training_scripts/training.py | 31 ++++++++---- 6 files changed, 70 insertions(+), 19 deletions(-) rename checkthat/task1/{ => depricated_scripts}/metrics/__init__.py (100%) rename checkthat/task1/{ => depricated_scripts}/metrics/compute_metrics.py (100%) rename checkthat/task1/{ => depricated_scripts}/metrics/metrics_logger.py (84%) diff --git a/checkthat/task1/metrics/__init__.py b/checkthat/task1/depricated_scripts/metrics/__init__.py similarity index 100% rename from checkthat/task1/metrics/__init__.py rename to checkthat/task1/depricated_scripts/metrics/__init__.py diff --git a/checkthat/task1/metrics/compute_metrics.py b/checkthat/task1/depricated_scripts/metrics/compute_metrics.py similarity index 100% rename from checkthat/task1/metrics/compute_metrics.py rename to checkthat/task1/depricated_scripts/metrics/compute_metrics.py diff --git a/checkthat/task1/metrics/metrics_logger.py b/checkthat/task1/depricated_scripts/metrics/metrics_logger.py similarity index 84% rename from checkthat/task1/metrics/metrics_logger.py rename to checkthat/task1/depricated_scripts/metrics/metrics_logger.py index 309b438..c8a2e4d 100644 --- a/checkthat/task1/metrics/metrics_logger.py +++ b/checkthat/task1/depricated_scripts/metrics/metrics_logger.py @@ -20,9 +20,9 @@ def compute_custom_metrics(logits, labels): predictions = np.argmax(logits, axis=1) # Convert logits to predictions # Calculate metrics - precision = precision_score(labels, predictions, average="macro", pos_label=1) - recall = recall_score(labels, predictions, average="macro", pos_label=1) - f1 = f1_score(labels, predictions, average="macro", pos_label=1) + precision = precision_score(labels, predictions, average="binary", pos_label=1) + recall = recall_score(labels, predictions, average="binary", pos_label=1) + f1 = f1_score(labels, predictions, average="binary", pos_label=1) return precision, recall, f1 diff --git a/checkthat/task1/main.py b/checkthat/task1/main.py index 738f2fe..f7394f4 100644 --- a/checkthat/task1/main.py +++ b/checkthat/task1/main.py @@ -45,6 +45,6 @@ def main(args): default="iai-group/clef2024_checkthat_task1_en", # For English language help="Name of the dataset from the iai-group/clef2024_checkthat_task1_* datasets", ) - + args = parser.parse_args() main(args) diff --git a/checkthat/task1/start_train.sh b/checkthat/task1/start_train.sh index f2dd707..2e16f63 100644 --- a/checkthat/task1/start_train.sh +++ b/checkthat/task1/start_train.sh @@ -1,9 +1,9 @@ #!/bin/bash -#SBATCH --gres=gpu:1 +#SBATCH --gres=gpu:8 #SBATCH --partition=gpuA100 -#SBATCH --time=1:00:00 -#SBATCH --job-name=CLEF2024_task1_training -#SBATCH --output=start_train_all.out +#SBATCH --time=24:00:00 +#SBATCH --job-name=checkthat_training +#SBATCH --output=checkthat_training.out # Load CUDA and cuDNN modules module load cuda/12.2.0 cudnn/8.8.0 @@ -25,5 +25,43 @@ echo $PATH # Disable tokenizers parallelism for better GPU utilization export TOKENIZERS_PARALLELISM=false -# Run the Python script that uses the GPU -python -u main_train_all.py \ No newline at end of file +PROJECT_NAME="EN-SWEEP-no-data-alter" + +run_sweep_and_agent () { + # Ensure the PROJECT_NAME environment variable is set + if [[ -z "$PROJECT_NAME" ]]; then + echo "Error: PROJECT_NAME must be set." 
+ return 1 + fi + + echo "Initializing sweep using sweep.yaml in project: $PROJECT_NAME..." + + # Run the wandb sweep command using a fixed file path + wandb sweep --project "$PROJECT_NAME" "sweep.yaml" > temp_output.txt 2>&1 + + # Check if the wandb sweep command succeeded + if [ $? -ne 0 ]; then + echo "Error: Failed to initialize sweep. See output below:" + cat temp_output.txt + return 1 + fi + + # Extract the sweep ID using awk + SWEEP_ID=$(awk '/wandb agent/{ match($0, /wandb agent (.+)/, arr); print arr[1]; }' temp_output.txt) + + # Check if the sweep ID was extracted successfully + if [[ -z "$SWEEP_ID" ]]; then + echo "Error: Failed to extract sweep ID from output." + cat temp_output.txt + return 1 + fi + + # Cleanup: Remove the temporary output file + rm temp_output.txt + + # Run the wandb agent command + echo "Starting wandb agent for sweep ID: $SWEEP_ID" + wandb agent $SWEEP_ID +} + +run_sweep_and_agent = EN-SWEEP-no-data-alter \ No newline at end of file diff --git a/checkthat/task1/training_scripts/training.py b/checkthat/task1/training_scripts/training.py index 0f1e10b..dd314bb 100644 --- a/checkthat/task1/training_scripts/training.py +++ b/checkthat/task1/training_scripts/training.py @@ -3,12 +3,16 @@ This script trains the model for a single seed. """ import wandb -from transformers import Trainer, EarlyStoppingCallback +from transformers import Trainer, EarlyStoppingCallback, EvalPrediction +from sklearn.metrics import precision_recall_fscore_support, accuracy_score + from tokenization.tokenizer import TextDataset from models.custom_model import CustomModel -from metrics.compute_metrics import compute_metrics +#from metrics.compute_metrics import compute_metrics from training_scripts.train_config import get_training_arguments from training_scripts.train_config import get_language +from sklearn.metrics import precision_recall_fscore_support, accuracy_score + import numpy as np import torch import torch.cuda @@ -17,6 +21,12 @@ torch.backends.cudnn.allow_tf32 = True from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments +def compute_metrics(p: EvalPrediction): + preds = np.argmax(p.predictions, axis=1) + labels = p.label_ids + precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro', pos_label=1) + acc = accuracy_score(labels, preds) + return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall} def run_training(train_dataset, eval_dataset, model_name, label_map, dataset_language, test_dataset=None,): @@ -35,7 +45,7 @@ def run_training(train_dataset, eval_dataset, model_name, label_map, dataset_lan # Define training arguments training_arguments = TrainingArguments( - output_dir="./results", # Directory to save model and tokenizer + output_dir=f"./results_{dataset_language}", # Directory to save model and tokenizer evaluation_strategy="epoch", learning_rate=wandb.config.learning_rate, per_device_train_batch_size=wandb.config.batch_size, @@ -61,6 +71,7 @@ def run_training(train_dataset, eval_dataset, model_name, label_map, dataset_lan callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] ) + # Train the model trainer.train() # Evaluate the model on the test dataset @@ -69,12 +80,7 @@ def run_training(train_dataset, eval_dataset, model_name, label_map, dataset_lan - # Evaluate the model - eval_results = trainer.evaluate() - - # Log evaluation and test results to W&B - wandb.log({"eval_results": eval_results}) - wandb.log({"test_results": test_results}) + # Save model and tokenizer at 
the end of training model_path = f"{training_arguments.output_dir}/{run_name}_model_{dataset_language}" @@ -83,6 +89,13 @@ def run_training(train_dataset, eval_dataset, model_name, label_map, dataset_lan hf_model.save_pretrained(model_path) hf_tokenizer.save_pretrained(tokenizer_path) + # Evaluate the model + eval_results = trainer.evaluate() + + # Log evaluation and test results to W&B + wandb.log({"eval_results": eval_results}) + wandb.log({"test_results": test_results}) + # Ensure the W&B run is finished wandb.finish() From 799b544e464b86556f2d7f7d8fa28c91bc63e2f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20R=C3=B8ysland=20Aarnes?= Date: Mon, 6 May 2024 13:20:16 +0200 Subject: [PATCH 16/16] Tweaks --- checkthat/task1/sweep.yaml | 2 +- checkthat/task1/training_scripts/training.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/checkthat/task1/sweep.yaml b/checkthat/task1/sweep.yaml index f4a5e5d..f3de7d0 100644 --- a/checkthat/task1/sweep.yaml +++ b/checkthat/task1/sweep.yaml @@ -20,7 +20,7 @@ parameters: hidden_dropout_prob: values: [0.1, 0.2, 0.3] # Define discrete steps for grid search epochs: - values: [10, 20, 50] # Convert range to discrete values + values: [50] # Convert range to discrete values batch_size: values: [16, 32] learning_rate: diff --git a/checkthat/task1/training_scripts/training.py b/checkthat/task1/training_scripts/training.py index dd314bb..ee3ce84 100644 --- a/checkthat/task1/training_scripts/training.py +++ b/checkthat/task1/training_scripts/training.py @@ -43,24 +43,25 @@ def run_training(train_dataset, eval_dataset, model_name, label_map, dataset_lan hf_tokenizer = AutoTokenizer.from_pretrained(model_name) - # Define training arguments training_arguments = TrainingArguments( - output_dir=f"./results_{dataset_language}", # Directory to save model and tokenizer + output_dir=f"./results_{dataset_language}", evaluation_strategy="epoch", learning_rate=wandb.config.learning_rate, per_device_train_batch_size=wandb.config.batch_size, num_train_epochs=wandb.config.epochs, logging_dir='./logs', - logging_steps=10, + logging_steps=100, do_train=True, do_eval=True, load_best_model_at_end=True, - save_strategy="epoch", # Save model at the end of each epoch - save_total_limit=1, # Optional: limits the total amount of checkpoints, deleting older + metric_for_best_model="f1", # Here you specify the metric from your sweep config + greater_is_better=True, # Since the goal is to maximize + save_strategy="epoch", + save_total_limit=1, report_to="wandb", run_name=run_name, ) - + # Create a Trainer instance trainer = Trainer( model=hf_model,