"""Dataset loading for training and evaluating the JointBERT model."""
import os
import re
from typing import Dict, Generator, List, Tuple

import torch
import yaml
from torch.utils.data import Dataset
from transformers import BertTokenizer

from moviebot.nlu.annotation.joint_bert.slot_mapping import (
    JointBERTIntent,
    JointBERTSlot,
)

DataPoint = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]

_IGNORE_INDEX = -100
_TOKENIZER_PATH = "bert-base-uncased"


def load_yaml(path: str) -> Dict[str, List[str]]:
    """Loads the YAML file at the given path.

    Args:
        path: The path to the YAML file.

    Raises:
        FileNotFoundError: If the file does not exist.

    Returns:
        The data in the YAML file.
    """
    if not os.path.isfile(path):
        raise FileNotFoundError(f"File not found: {path}")

    with open(path) as f:
        return yaml.safe_load(f)


def parse_data(
    data: Dict[str, List[str]]
) -> Generator[Tuple[str, str, List[Tuple[str, str]]], None, None]:
    """Parses the input data to extract intent, text, and slot annotations.

    Args:
        data: The input data, mapping each intent to a list of annotated
            example utterances.

    Yields:
        A tuple of the intent, the text with annotations removed, and the
        list of (slot text, slot label) pairs found in the text.
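
    Example:
        A slot is annotated inline as "[slot text](slot_label)". For
        instance, the annotated example "I like [scifi](genre)" listed under
        an intent key "reveal" (names illustrative) would yield
        ("reveal", "I like scifi", [("scifi", "genre")]).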
    """
    for intent, annotated_examples in data.items():
        for annotated_example in annotated_examples:
            # Extract slot information
            slot_annotations = re.findall(
                r"\[(.*?)\]\((.*?)\)", annotated_example
            )

            # Remove slot annotations from the text
            clean_text = re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", annotated_example)

            yield intent, clean_text, slot_annotations


class JointBERTDataset(Dataset):
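    """Dataset of annotated utterances for joint intent classification and
    slot filling with the JointBERT model."""
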
    def __init__(self, path: str, max_length: int = 32) -> None:
        """Initializes the dataset.

        Args:
            path: The path to the YAML file containing the data.
            max_length: The maximum length of the input sequence. Defaults to
                32.
        """
        self.data = load_yaml(path)
        self.max_length = max_length

        self.intent_label_count = len(JointBERTIntent)
        self.slot_label_count = len(JointBERTSlot)

        self.tokenizer = BertTokenizer.from_pretrained(_TOKENIZER_PATH)

        self.examples = []
        self._build_dataset()

    def _build_dataset(self) -> None:
        """Builds the dataset from the parsed annotated examples."""
        for intent, clean_text, slot_annotations in parse_data(self.data):
            intent, tokens, labels = self._tokenize_and_label(
                intent, clean_text, slot_annotations
            )

            input_ids = self.tokenizer.encode(tokens, add_special_tokens=True)
            attention_mask = [1] * len(input_ids)

            # Add ignore-index labels for the [CLS] and [SEP] special tokens
            cls_label = _IGNORE_INDEX
            sep_label = _IGNORE_INDEX
            labels = [cls_label] + labels + [sep_label]

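            # NOTE: padding below assumes the tokenized sequence (including
            # [CLS] and [SEP]) does not exceed max_length; longer utterances
            # are not truncated here.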
            # Pad input_ids, attention_mask, and labels
            padding_length = self.max_length - len(input_ids)
            input_ids = input_ids + (
                [self.tokenizer.pad_token_id] * padding_length
            )
            attention_mask = attention_mask + ([0] * padding_length)
            labels = labels + ([_IGNORE_INDEX] * padding_length)
            self.examples.append((input_ids, attention_mask, intent, labels))

    def _num_word_tokens(self, word: str) -> int:
        """Returns the number of wordpiece tokens for the input word.

        Args:
            word: The input word.

        Returns:
            The number of wordpiece tokens the tokenizer splits the word into.
        """
        return len(self.tokenizer.tokenize(word))

    def _tokenize_and_label(
        self, intent: str, text: str, slot_annotations: List[Tuple[str, str]]
    ) -> Tuple[int, List[str], List[int]]:
        """Tokenizes the text and assigns labels based on slot annotations.

        The main purpose of this method is to convert the slot annotations into
        labels that can be used to train the model. The labels need to have the
        same length as the tokenized utterance.

        For example:

          Input: "I like scifi."
          Tokens: ["i", "like", "sci", "##fi", "."]
          Labels: ["OUT", "OUT", "B_GENRE", -100, "OUT"]
          Indexes: [0, 0, 3, -100, 0]

        Note that -100 is assigned to wordpiece tokens that are not the first
        subword of a word, so they are ignored when evaluating the loss. This
        makes it easier to decode the labels later.

        Args:
            intent: The intent of the text.
            text: The text to tokenize.
            slot_annotations: A list of (slot text, slot label) pairs found in
                the text.

        Returns:
            A tuple of the intent index, tokenized text, and labels.
        """
        tokens = self.tokenizer.tokenize(text)
        labels = []

        start_idx = 0
        for slot_text, slot_label in slot_annotations:
            index = text.find(slot_text, start_idx)
            for word in text[start_idx:index].split():
                labels.append(JointBERTSlot.to_index("OUT"))
                labels.extend(
                    [_IGNORE_INDEX] * (self._num_word_tokens(word) - 1)
                )

            for i, word in enumerate(slot_text.split()):
                labels.append(
                    JointBERTSlot.to_index(
                        ("B_" if i == 0 else "I_") + slot_label.upper()
                    )
                )
                labels.extend(
                    [_IGNORE_INDEX] * (self._num_word_tokens(word) - 1)
                )
            start_idx = index + len(slot_text)

        for word in text[start_idx:].split():
            labels.append(JointBERTSlot.to_index("OUT"))
            labels.extend(
                [_IGNORE_INDEX] * (self._num_word_tokens(word) - 1)
            )

        assert len(tokens) == len(labels)
        return JointBERTIntent.to_index(intent.upper()), tokens, labels

    def __len__(self) -> int:
        """Returns the number of examples in the dataset."""
        return len(self.examples)

    def __getitem__(self, idx: int) -> DataPoint:
        """Returns the example at the given index.

        Args:
            idx: The index of the example to return.

        Returns:
            A tuple of the input_ids, attention_mask, intent, and labels.
        """
        input_ids, attention_mask, intent, labels = self.examples[idx]

        return (
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(attention_mask, dtype=torch.long),
            torch.tensor(intent, dtype=torch.long),
            torch.tensor(labels, dtype=torch.long),
        )
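

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): the YAML path below is a
    # placeholder, and the DataLoader settings are arbitrary examples, not
    # the project's actual training configuration.
    from torch.utils.data import DataLoader

    dataset = JointBERTDataset("path/to/annotated_utterances.yaml")
    loader = DataLoader(dataset, batch_size=8, shuffle=True)

    input_ids, attention_mask, intent, labels = next(iter(loader))
    print(input_ids.shape, attention_mask.shape, intent.shape, labels.shape)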