From 8510dc950e9028c6c5858efdf6fc953145983263 Mon Sep 17 00:00:00 2001
From: ixaxaar <root@ixaxaar.in>
Date: Fri, 27 Oct 2023 18:38:58 +0530
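Subject: [PATCH] refactor variable names

Rename the `model_class_name` / `tokenizer_class_name` parameters and
attributes to `model_class` / `tokenizer_class` in the base and
embeddings API/bulk modules, their docstrings, and the base tests.

Also remove the sentiment analysis fine-tuner
(geniusrise_huggingface/sentiment_analysis.py), its test module, and
its import from the package `__init__.py`.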

---
 geniusrise_huggingface/__init__.py            |   1 -
 geniusrise_huggingface/base/api.py            |  22 +-
 geniusrise_huggingface/base/bulk.py           |  12 +-
 geniusrise_huggingface/base/tests/test_api.py |  18 +-
 .../base/tests/test_bulk.py                   |  18 +-
 geniusrise_huggingface/embeddings/api.py      |  24 +-
 geniusrise_huggingface/embeddings/bulk.py     |  24 +-
 geniusrise_huggingface/sentiment_analysis.py  | 258 ------------------
 tests/test_sentiment_analysis.py              | 168 ------------
 9 files changed, 59 insertions(+), 486 deletions(-)
 delete mode 100644 geniusrise_huggingface/sentiment_analysis.py
 delete mode 100644 tests/test_sentiment_analysis.py

diff --git a/geniusrise_huggingface/__init__.py b/geniusrise_huggingface/__init__.py
index 1b3d7e7..fa99ea9 100644
--- a/geniusrise_huggingface/__init__.py
+++ b/geniusrise_huggingface/__init__.py
@@ -20,6 +20,5 @@
 from geniusrise_huggingface.language_model import HuggingFaceLanguageModelingFineTuner
 from geniusrise_huggingface.ner import HuggingFaceNamedEntityRecognitionFineTuner
 from geniusrise_huggingface.question_answering import HuggingFaceQuestionAnsweringFineTuner
-from geniusrise_huggingface.sentiment_analysis import HuggingFaceSentimentAnalysisFineTuner
 from geniusrise_huggingface.summarization import HuggingFaceSummarizationFineTuner
 from geniusrise_huggingface.translation import HuggingFaceTranslationFineTuner
diff --git a/geniusrise_huggingface/base/api.py b/geniusrise_huggingface/base/api.py
index a9c874c..9568c40 100644
--- a/geniusrise_huggingface/base/api.py
+++ b/geniusrise_huggingface/base/api.py
@@ -32,8 +32,8 @@ class HuggingFaceAPI(HuggingFaceBulk):
         model_revision (Optional[str]): The revision of the pre-trained language model.
         tokenizer_name (str): The name of the tokenizer used to preprocess input text.
         tokenizer_revision (Optional[str]): The revision of the tokenizer used to preprocess input text.
-        model_class_name (str): The name of the class of the pre-trained language model.
-        tokenizer_class_name (str): The name of the class of the tokenizer used to preprocess input text.
+        model_class (str): The name of the class of the pre-trained language model.
+        tokenizer_class (str): The name of the class of the tokenizer used to preprocess input text.
         use_cuda (bool): Whether to use a GPU for inference.
         quantization (int): The level of quantization to use for the pre-trained language model.
         precision (str): The precision to use for the pre-trained language model.
@@ -46,7 +46,7 @@ class HuggingFaceAPI(HuggingFaceBulk):
         text(**kwargs: Any) -> Dict[str, Any]:
             Generates text based on the given prompt and decoding strategy.
 
-        listen(model_name: str, model_class_name: str = "AutoModelForCausalLM", tokenizer_class_name: str = "AutoTokenizer", use_cuda: bool = False, precision: str = "float16", quantization: int = 0, device_map: str | Dict | None = "auto", max_memory={0: "24GB"}, torchscript: bool = True, endpoint: str = "*", port: int = 3000, cors_domain: str = "http://localhost:3000", username: Optional[str] = None, password: Optional[str] = None, **model_args: Any) -> None:
+        listen(model_name: str, model_class: str = "AutoModelForCausalLM", tokenizer_class: str = "AutoTokenizer", use_cuda: bool = False, precision: str = "float16", quantization: int = 0, device_map: str | Dict | None = "auto", max_memory={0: "24GB"}, torchscript: bool = True, endpoint: str = "*", port: int = 3000, cors_domain: str = "http://localhost:3000", username: Optional[str] = None, password: Optional[str] = None, **model_args: Any) -> None:
             Starts a CherryPy server to listen for requests to generate text.
     """
 
@@ -118,8 +118,8 @@ def text(self, **kwargs: Any) -> Dict[str, Any]:
     def listen(
         self,
         model_name: str,
-        model_class_name: str = "AutoModelForCausalLM",
-        tokenizer_class_name: str = "AutoTokenizer",
+        model_class: str = "AutoModelForCausalLM",
+        tokenizer_class: str = "AutoTokenizer",
         use_cuda: bool = False,
         precision: str = "float16",
         quantization: int = 0,
@@ -138,8 +138,8 @@ def listen(
 
         Args:
             model_name (str): The name of the pre-trained language model.
-            model_class_name (str, optional): The name of the class of the pre-trained language model. Defaults to "AutoModelForCausalLM".
-            tokenizer_class_name (str, optional): The name of the class of the tokenizer used to preprocess input text. Defaults to "AutoTokenizer".
+            model_class (str, optional): The name of the class of the pre-trained language model. Defaults to "AutoModelForCausalLM".
+            tokenizer_class (str, optional): The name of the class of the tokenizer used to preprocess input text. Defaults to "AutoTokenizer".
             use_cuda (bool, optional): Whether to use a GPU for inference. Defaults to False.
             precision (str, optional): The precision to use for the pre-trained language model. Defaults to "float16".
             quantization (int, optional): The level of quantization to use for the pre-trained language model. Defaults to 0.
@@ -154,8 +154,8 @@ def listen(
             **model_args (Any): Additional arguments to pass to the pre-trained language model.
         """
         self.model_name = model_name
-        self.model_class_name = model_class_name
-        self.tokenizer_class_name = tokenizer_class_name
+        self.model_class = model_class
+        self.tokenizer_class = tokenizer_class
         self.use_cuda = use_cuda
         self.quantization = quantization
         self.precision = precision
@@ -182,8 +182,8 @@ def listen(
             tokenizer_name=self.tokenizer_name,
             model_revision=self.model_revision,
             tokenizer_revision=self.tokenizer_revision,
-            model_class_name=self.model_class_name,
-            tokenizer_class_name=self.tokenizer_class_name,
+            model_class=self.model_class,
+            tokenizer_class=self.tokenizer_class,
             use_cuda=self.use_cuda,
             precision=self.precision,
             quantization=self.quantization,
diff --git a/geniusrise_huggingface/base/bulk.py b/geniusrise_huggingface/base/bulk.py
index e660a4a..50b4509 100644
--- a/geniusrise_huggingface/base/bulk.py
+++ b/geniusrise_huggingface/base/bulk.py
@@ -205,8 +205,8 @@ def load_models(
         tokenizer_name: str,
         model_revision: Optional[str] = None,
         tokenizer_revision: Optional[str] = None,
-        model_class_name: str = "AutoModelForCausalLM",
-        tokenizer_class_name: str = "AutoTokenizer",
+        model_class: str = "AutoModelForCausalLM",
+        tokenizer_class: str = "AutoTokenizer",
         use_cuda: bool = False,
         precision: str = "float16",
         quantization: int = 0,
@@ -220,8 +220,8 @@ def load_models(
 
         Parameters:
         - model_name (str): The name of the model to load.
-        - model_class_name (str): The class name of the model to load. Default is "AutoModelForCausalLM".
-        - tokenizer_class_name (str): The class name of the tokenizer to load. Default is "AutoTokenizer".
+        - model_class (str): The class name of the model to load. Default is "AutoModelForCausalLM".
+        - tokenizer_class (str): The class name of the tokenizer to load. Default is "AutoTokenizer".
         - use_cuda (bool): Whether to use CUDA for GPU acceleration. Default is False.
         - precision (str): The bit precision for model and tokenizer. Options are 'float32', 'float16', 'bfloat16'. Default is 'float16'.
         - device_map (Union[str, Dict]): Device map for model placement. Default is "auto".
@@ -251,8 +251,8 @@ def load_models(
         if use_cuda and not device_map:
             device_map = "auto"
 
-        ModelClass = getattr(transformers, model_class_name)
-        TokenizerClass = getattr(transformers, tokenizer_class_name)
+        ModelClass = getattr(transformers, model_class)
+        TokenizerClass = getattr(transformers, tokenizer_class)
 
         # Load the model and tokenizer
         tokenizer = TokenizerClass.from_pretrained(tokenizer_name, revision=tokenizer_revision, torch_dtype=torch_dtype)
diff --git a/geniusrise_huggingface/base/tests/test_api.py b/geniusrise_huggingface/base/tests/test_api.py
index 303e818..74ab46d 100644
--- a/geniusrise_huggingface/base/tests/test_api.py
+++ b/geniusrise_huggingface/base/tests/test_api.py
@@ -25,7 +25,7 @@
 
 @pytest.fixture(
     params=[
-        # model_name, model_class_name, tokenizer_class_name, use_cuda, precision, quantization, device_map, max_memory, torchscript
+        # model_name, model_class, tokenizer_class, use_cuda, precision, quantization, device_map, max_memory, torchscript
         # fmt: off
         ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", 0, None, None, False),
         ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", False, "float32", 0, None, None, False),
@@ -72,8 +72,8 @@ def hfa():
 def test_load_models(hfa, model_config):
     (
         model_name,
-        model_class_name,
-        tokenizer_class_name,
+        model_class,
+        tokenizer_class,
         use_cuda,
         precision,
         quantization,
@@ -97,8 +97,8 @@ def test_load_models(hfa, model_config):
         model_revision=model_revision,
         tokenizer_name=model_name,
         tokenizer_revision=tokenizer_revision,
-        model_class_name=model_class_name,
-        tokenizer_class_name=tokenizer_class_name,
+        model_class=model_class,
+        tokenizer_class=tokenizer_class,
         use_cuda=use_cuda,
         precision=precision,
         quantization=quantization,
@@ -149,8 +149,8 @@ def test_load_models(hfa, model_config):
 def test_generate_strategies(hfa, model_config, strategy):
     (
         model_name,
-        model_class_name,
-        tokenizer_class_name,
+        model_class,
+        tokenizer_class,
         use_cuda,
         precision,
         quantization,
@@ -174,8 +174,8 @@ def test_generate_strategies(hfa, model_config, strategy):
         model_revision=model_revision,
         tokenizer_name=model_name,
         tokenizer_revision=tokenizer_revision,
-        model_class_name=model_class_name,
-        tokenizer_class_name=tokenizer_class_name,
+        model_class=model_class,
+        tokenizer_class=tokenizer_class,
         use_cuda=use_cuda,
         precision=precision,
         quantization=quantization,
diff --git a/geniusrise_huggingface/base/tests/test_bulk.py b/geniusrise_huggingface/base/tests/test_bulk.py
index 198e009..88f979f 100644
--- a/geniusrise_huggingface/base/tests/test_bulk.py
+++ b/geniusrise_huggingface/base/tests/test_bulk.py
@@ -25,7 +25,7 @@
 
 @pytest.fixture(
     params=[
-        # model_name, model_class_name, tokenizer_class_name, use_cuda, precision, quantization, device_map, max_memory, torchscript
+        # model_name, model_class, tokenizer_class, use_cuda, precision, quantization, device_map, max_memory, torchscript
         # fmt: off
         ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", 0, None, None, False),
         ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", False, "float32", 0, None, None, False),
@@ -72,8 +72,8 @@ def hfa():
 def test_load_models(hfa, model_config):
     (
         model_name,
-        model_class_name,
-        tokenizer_class_name,
+        model_class,
+        tokenizer_class,
         use_cuda,
         precision,
         quantization,
@@ -97,8 +97,8 @@ def test_load_models(hfa, model_config):
         model_revision=model_revision,
         tokenizer_name=model_name,
         tokenizer_revision=tokenizer_revision,
-        model_class_name=model_class_name,
-        tokenizer_class_name=tokenizer_class_name,
+        model_class=model_class,
+        tokenizer_class=tokenizer_class,
         use_cuda=use_cuda,
         precision=precision,
         quantization=quantization,
@@ -149,8 +149,8 @@ def test_load_models(hfa, model_config):
 def test_generate_strategies(hfa, model_config, strategy):
     (
         model_name,
-        model_class_name,
-        tokenizer_class_name,
+        model_class,
+        tokenizer_class,
         use_cuda,
         precision,
         quantization,
@@ -174,8 +174,8 @@ def test_generate_strategies(hfa, model_config, strategy):
         model_revision=model_revision,
         tokenizer_name=model_name,
         tokenizer_revision=tokenizer_revision,
-        model_class_name=model_class_name,
-        tokenizer_class_name=tokenizer_class_name,
+        model_class=model_class,
+        tokenizer_class=tokenizer_class,
         use_cuda=use_cuda,
         precision=precision,
         quantization=quantization,
diff --git a/geniusrise_huggingface/embeddings/api.py b/geniusrise_huggingface/embeddings/api.py
index 90e758b..3439d4c 100644
--- a/geniusrise_huggingface/embeddings/api.py
+++ b/geniusrise_huggingface/embeddings/api.py
@@ -45,8 +45,8 @@ class EmbeddingsAPI(HuggingFaceAPI):
     genius EmbeddingsAPI rise \
         listen \
             --model_name=bert-base-uncased \
-            --model_class_name=AutoModelForCausalLM \
-            --tokenizer_class_name=AutoTokenizer \
+            --model_class=AutoModelForCausalLM \
+            --tokenizer_class=AutoTokenizer \
             --sentence_transformer_model=paraphrase-MiniLM-L6-v2 \
             --use_cuda=True \
             --precision=float16 \
@@ -68,8 +68,8 @@ class EmbeddingsAPI(HuggingFaceAPI):
             method: "listen"
             args:
                 model_name: "bert-base-uncased"
-                model_class_name: "AutoModelForCausalLM"
-                tokenizer_class_name: "AutoTokenizer"
+                model_class: "AutoModelForCausalLM"
+                tokenizer_class: "AutoTokenizer"
                 sentence_transformer_model: "paraphrase-MiniLM-L6-v2"
                 use_cuda: True
                 precision: "float16"
@@ -234,8 +234,8 @@ def sentence_permutations(self, **kwargs: Any) -> Dict[str, Any]:
     def listen(  # type: ignore
         self,
         model_name: str,
-        model_class_name: str = "AutoModelForCausalLM",
-        tokenizer_class_name: str = "AutoTokenizer",
+        model_class: str = "AutoModelForCausalLM",
+        tokenizer_class: str = "AutoTokenizer",
         use_cuda: bool = False,
         precision: str = "float16",
         quantization: int = 0,
@@ -254,8 +254,8 @@ def listen(  # type: ignore
 
         Parameters:
         - model_name (str): The name of the Hugging Face model to use.
-        - model_class_name (str, optional): The class name of the model. Defaults to "AutoModelForCausalLM".
-        - tokenizer_class_name (str, optional): The class name of the tokenizer. Defaults to "AutoTokenizer".
+        - model_class (str, optional): The class name of the model. Defaults to "AutoModelForCausalLM".
+        - tokenizer_class (str, optional): The class name of the tokenizer. Defaults to "AutoTokenizer".
         - sentence_transformer_model (str, optional): The name of the Sentence Transformer model to use. Defaults to "paraphrase-MiniLM-L6-v2".
         - use_cuda (bool, optional): Whether to use CUDA for computation. Defaults to False.
         - precision (str, optional): The precision to use for computations. Defaults to "float16".
@@ -273,8 +273,8 @@ def listen(  # type: ignore
         None
         """
         self.model_name = model_name
-        self.model_class_name = model_class_name
-        self.tokenizer_class_name = tokenizer_class_name
+        self.model_class = model_class
+        self.tokenizer_class = tokenizer_class
         self.use_cuda = use_cuda
         self.quantization = quantization
         self.precision = precision
@@ -302,8 +302,8 @@ def listen(  # type: ignore
             tokenizer_name=self.tokenizer_name,
             model_revision=self.model_revision,
             tokenizer_revision=self.tokenizer_revision,
-            model_class_name=self.model_class_name,
-            tokenizer_class_name=self.tokenizer_class_name,
+            model_class=self.model_class,
+            tokenizer_class=self.tokenizer_class,
             use_cuda=self.use_cuda,
             precision=self.precision,
             quantization=self.quantization,
diff --git a/geniusrise_huggingface/embeddings/bulk.py b/geniusrise_huggingface/embeddings/bulk.py
index 343d07e..afde639 100644
--- a/geniusrise_huggingface/embeddings/bulk.py
+++ b/geniusrise_huggingface/embeddings/bulk.py
@@ -102,8 +102,8 @@ def load_models(
         tokenizer_name: str,
         model_revision: Optional[str] = None,
         tokenizer_revision: Optional[str] = None,
-        model_class_name: str = "AutoModelForCausalLM",
-        tokenizer_class_name: str = "AutoTokenizer",
+        model_class: str = "AutoModelForCausalLM",
+        tokenizer_class: str = "AutoTokenizer",
         use_cuda: bool = False,
         precision: str = "float16",
         quantization: int = 0,
@@ -117,8 +117,8 @@ def load_models(
 
         Parameters:
         - model_name (str): The name of the model to load.
-        - model_class_name (str): The class name of the model to load. Default is "AutoModelForCausalLM".
-        - tokenizer_class_name (str): The class name of the tokenizer to load. Default is "AutoTokenizer".
+        - model_class (str): The class name of the model to load. Default is "AutoModelForCausalLM".
+        - tokenizer_class (str): The class name of the tokenizer to load. Default is "AutoTokenizer".
         - use_cuda (bool): Whether to use CUDA for GPU acceleration. Default is False.
         - precision (str): The bit precision for model and tokenizer. Options are 'float32', 'float16', 'bfloat16'. Default is 'float16'.
         - device_map (Union[str, Dict]): Device map for model placement. Default is "auto".
@@ -148,8 +148,8 @@ def load_models(
         if use_cuda and not device_map:
             device_map = "auto"
 
-        ModelClass = getattr(transformers, model_class_name)
-        TokenizerClass = getattr(transformers, tokenizer_class_name)
+        ModelClass = getattr(transformers, model_class)
+        TokenizerClass = getattr(transformers, tokenizer_class)
 
         # Load the model and tokenizer
         tokenizer = TokenizerClass.from_pretrained(tokenizer_name, revision=tokenizer_revision, torch_dtype=torch_dtype)
@@ -199,8 +199,8 @@ def generate(
         self,
         kind: str,
         model_name: str,
-        model_class_name: str = "AutoModelForCausalLM",
-        tokenizer_class_name: str = "AutoTokenizer",
+        model_class: str = "AutoModelForCausalLM",
+        tokenizer_class: str = "AutoTokenizer",
         sentence_transformer_model: str = "paraphrase-MiniLM-L6-v2",
         use_cuda: bool = False,
         precision: str = "float16",
@@ -220,8 +220,8 @@ def generate(
         This method reads text data from the specified input path, generates embeddings, and saves them to the specified output path.
         """
         self.model_name = model_name
-        self.model_class_name = model_class_name
-        self.tokenizer_class_name = tokenizer_class_name
+        self.model_class = model_class
+        self.tokenizer_class = tokenizer_class
         self.use_cuda = use_cuda
         self.quantization = quantization
         self.precision = precision
@@ -255,8 +255,8 @@ def generate(
                 tokenizer_name=self.tokenizer_name,
                 model_revision=self.model_revision,
                 tokenizer_revision=self.tokenizer_revision,
-                model_class_name=self.model_class_name,
-                tokenizer_class_name=self.tokenizer_class_name,
+                model_class=self.model_class,
+                tokenizer_class=self.tokenizer_class,
                 use_cuda=self.use_cuda,
                 precision=self.precision,
                 quantization=self.quantization,
diff --git a/geniusrise_huggingface/sentiment_analysis.py b/geniusrise_huggingface/sentiment_analysis.py
deleted file mode 100644
index 63af85e..0000000
--- a/geniusrise_huggingface/sentiment_analysis.py
+++ /dev/null
@@ -1,258 +0,0 @@
-# 🧠 Geniusrise
-# Copyright (C) 2023  geniusrise.ai
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-import sqlite3
-import xml.etree.ElementTree as ET
-from typing import Any, Dict, List, Union
-
-import pandas as pd
-import torch
-import yaml  # type: ignore
-from datasets import Dataset, DatasetDict, load_from_disk
-from pyarrow import feather
-from pyarrow import parquet as pq
-from transformers import DataCollatorWithPadding
-
-from geniusrise_huggingface.base import HuggingFaceFineTuner
-
-
-class HuggingFaceSentimentAnalysisFineTuner(HuggingFaceFineTuner):
-    r"""
-    A bolt for fine-tuning Hugging Face models on sentiment analysis tasks.
-
-    Args:
-        input (BatchInput): The batch input data.
-        output (OutputConfig): The output data.
-        state (State): The state manager.
-
-    CLI Usage:
-
-    ```bash
-        genius HuggingFaceSentimentAnalysisFineTuner rise \
-            batch \
-                --input_s3_bucket geniusrise-test \
-                --input_s3_folder train \
-            batch \
-                --output_s3_bucket geniusrise-test \
-                --output_s3_folder model \
-            fine_tune \
-                --args model_name=my_model tokenizer_name=my_tokenizer num_train_epochs=3 per_device_train_batch_size=8
-    ```
-
-    YAML Configuration:
-
-    ```yaml
-        version: "1"
-        bolts:
-            my_fine_tuner:
-                name: "HuggingFaceSentimentAnalysisFineTuner"
-                method: "fine_tune"
-                args:
-                    model_name: "my_model"
-                    tokenizer_name: "my_tokenizer"
-                    num_train_epochs: 3
-                    per_device_train_batch_size: 8
-                    data_max_length: 512
-                input:
-                    type: "batch"
-                    args:
-                        bucket: "my_bucket"
-                        folder: "my_dataset"
-                output:
-                    type: "batch"
-                    args:
-                        bucket: "my_bucket"
-                        folder: "my_model"
-                deploy:
-                    type: k8s
-                    args:
-                        kind: deployment
-                        name: my_fine_tuner
-                        context_name: arn:aws:eks:us-east-1:genius-dev:cluster/geniusrise-dev
-                        namespace: geniusrise
-                        image: geniusrise/geniusrise
-                        kube_config_path: ~/.kube/config
-    ```
-
-    Supported Data Formats:
-        - JSONL
-        - CSV
-        - Parquet
-        - JSON
-        - XML
-        - YAML
-        - TSV
-        - Excel (.xls, .xlsx)
-        - SQLite (.db)
-        - Feather
-    """
-
-    def load_dataset(self, dataset_path: str, **kwargs: Any) -> Dataset | DatasetDict:
-        r"""
-        Load a dataset from a directory.
-
-        Args:
-            dataset_path (str): The path to the dataset directory.
-            **kwargs: Additional keyword arguments.
-
-        Returns:
-            Dataset | DatasetDict: The loaded dataset.
-
-        ## Supported Data Formats and Structures:
-
-        ### JSONL
-        Each line is a JSON object representing an example.
-        ```json
-        {"text": "The text content", "label": "The label"}
-        ```
-
-        ### CSV
-        Should contain 'text' and 'label' columns.
-        ```csv
-        text,label
-        "The text content","The label"
-        ```
-
-        ### Parquet
-        Should contain 'text' and 'label' columns.
-
-        ### JSON
-        An array of dictionaries with 'text' and 'label' keys.
-        ```json
-        [{"text": "The text content", "label": "The label"}]
-        ```
-
-        ### XML
-        Each 'record' element should contain 'text' and 'label' child elements.
-        ```xml
-        <record>
-            <text>The text content</text>
-            <label>The label</label>
-        </record>
-        ```
-
-        ### YAML
-        Each document should be a dictionary with 'text' and 'label' keys.
-        ```yaml
-        - text: "The text content"
-          label: "The label"
-        ```
-
-        ### TSV
-        Should contain 'text' and 'label' columns separated by tabs.
-
-        ### Excel (.xls, .xlsx)
-        Should contain 'text' and 'label' columns.
-
-        ### SQLite (.db)
-        Should contain a table with 'text' and 'label' columns.
-
-        ### Feather
-        Should contain 'text' and 'label' columns.
-        """
-        if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")):
-            dataset = load_from_disk(dataset_path)
-        else:
-            data = []
-            for filename in os.listdir(dataset_path):
-                filepath = os.path.join(dataset_path, filename)
-                if filename.endswith(".jsonl"):
-                    with open(filepath, "r") as f:
-                        for line in f:
-                            example = json.loads(line)
-                            data.append(example)
-                elif filename.endswith(".csv"):
-                    df = pd.read_csv(filepath)
-                    data.extend(df.to_dict("records"))
-                elif filename.endswith(".parquet"):
-                    df = pq.read_table(filepath).to_pandas()
-                    data.extend(df.to_dict("records"))
-                elif filename.endswith(".json"):
-                    with open(filepath, "r") as f:
-                        json_data = json.load(f)
-                        data.extend(json_data)
-                elif filename.endswith(".xml"):
-                    tree = ET.parse(filepath)
-                    root = tree.getroot()
-                    for record in root.findall("record"):
-                        text = record.find("text").text  # type: ignore
-                        label = record.find("label").text  # type: ignore
-                        data.append({"text": text, "label": label})
-                elif filename.endswith(".yaml") or filename.endswith(".yml"):
-                    with open(filepath, "r") as f:
-                        yaml_data = yaml.safe_load(f)
-                        data.extend(yaml_data)
-                elif filename.endswith(".tsv"):
-                    df = pd.read_csv(filepath, sep="\t")
-                    data.extend(df.to_dict("records"))
-                elif filename.endswith((".xls", ".xlsx")):
-                    df = pd.read_excel(filepath)
-                    data.extend(df.to_dict("records"))
-                elif filename.endswith(".db"):
-                    conn = sqlite3.connect(filepath)
-                    query = "SELECT text, label FROM dataset_table;"
-                    df = pd.read_sql_query(query, conn)
-                    data.extend(df.to_dict("records"))
-                elif filename.endswith(".feather"):
-                    df = feather.read_feather(filepath)
-                    data.extend(df.to_dict("records"))
-
-            if self.data_extractor_lambda:
-                fn = eval(self.data_extractor_lambda)
-                data = [fn(d) for d in data]
-            else:
-                data = data
-
-            dataset = Dataset.from_pandas(pd.DataFrame(data))
-
-        tokenized_dataset = dataset.map(
-            self.prepare_train_features,
-            batched=True,
-            remove_columns=dataset.column_names,
-        )
-        return tokenized_dataset
-
-    def prepare_train_features(self, examples: Dict[str, Union[str, int]]) -> Dict[str, Union[List[int], int]]:
-        """
-        Tokenize the examples and prepare the features for training.
-
-        Args:
-            examples (Dict[str, Union[str, int]]): A dictionary of examples.
-
-        Returns:
-            Dict[str, Union[List[int], int]]: The processed features.
-        """
-        if not self.tokenizer:
-            raise Exception("No tokenizer found, please call load_models first.")
-
-        tokenized_inputs = self.tokenizer(examples["text"], truncation=True, padding=False)
-        tokenized_inputs["labels"] = examples["label"]
-        return tokenized_inputs
-
-    def data_collator(
-        self, examples: List[Dict[str, Union[List[int], int]]]
-    ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
-        """
-        Customize the data collator.
-
-        Args:
-            examples (List[Dict[str, Union[List[int], int]]]): The examples to collate.
-
-        Returns:
-            Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: The collated data.
-        """
-        return DataCollatorWithPadding(self.tokenizer)(examples)
diff --git a/tests/test_sentiment_analysis.py b/tests/test_sentiment_analysis.py
deleted file mode 100644
index 4308a1a..0000000
--- a/tests/test_sentiment_analysis.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# 🧠 Geniusrise
-# Copyright (C) 2023  geniusrise.ai
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-import sqlite3
-import tempfile
-import xml.etree.ElementTree as ET
-
-import numpy as np
-import pandas as pd
-import pytest
-import yaml  # type: ignore
-from datasets import Dataset
-from geniusrise.core import BatchInput, BatchOutput, InMemoryState
-from pyarrow import feather
-from pyarrow import parquet as pq
-from transformers import EvalPrediction
-
-from geniusrise_huggingface.sentiment_analysis import HuggingFaceSentimentAnalysisFineTuner
-
-
-# Helper function to create synthetic data in different formats
-def create_dataset_in_format(directory, ext):
-    os.makedirs(directory, exist_ok=True)
-    data = [{"text": f"text_{i}", "label": i % 2} for i in range(10)]
-    df = pd.DataFrame(data)
-
-    if ext == "huggingface":
-        dataset = Dataset.from_pandas(df)
-        dataset.save_to_disk(directory)
-    elif ext == "csv":
-        df.to_csv(os.path.join(directory, "data.csv"), index=False)
-    elif ext == "jsonl":
-        with open(os.path.join(directory, "data.jsonl"), "w") as f:
-            for item in data:
-                f.write(json.dumps(item) + "\n")
-    elif ext == "parquet":
-        pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet"))
-    elif ext == "json":
-        with open(os.path.join(directory, "data.json"), "w") as f:
-            json.dump(data, f)
-    elif ext == "xml":
-        root = ET.Element("root")
-        for item in data:
-            record = ET.SubElement(root, "record")
-            ET.SubElement(record, "text").text = item["text"]
-            ET.SubElement(record, "label").text = str(item["label"])
-        tree = ET.ElementTree(root)
-        tree.write(os.path.join(directory, "data.xml"))
-    elif ext == "yaml":
-        with open(os.path.join(directory, "data.yaml"), "w") as f:
-            yaml.dump(data, f)
-    elif ext == "tsv":
-        df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t")
-    elif ext == "xlsx":
-        df.to_excel(os.path.join(directory, "data.xlsx"), index=False)
-    elif ext == "db":
-        conn = sqlite3.connect(os.path.join(directory, "data.db"))
-        df.to_sql("dataset_table", conn, if_exists="replace", index=False)
-        conn.close()
-    elif ext == "feather":
-        feather.write_feather(df, os.path.join(directory, "data.feather"))
-
-
-# Fixtures for each file type
-@pytest.fixture(
-    params=[
-        "huggingface",
-        "csv",
-        "json",
-        "jsonl",
-        "parquet",
-        "xml",
-        "yaml",
-        "tsv",
-        "xlsx",
-        "db",
-        "feather",
-    ]
-)
-def dataset_file(request, tmpdir):
-    ext = request.param
-    create_dataset_in_format(tmpdir + "/train", ext)
-    create_dataset_in_format(tmpdir + "/eval", ext)
-    return tmpdir, ext
-
-
-@pytest.fixture
-def sentiment_bolt():
-    input_dir = tempfile.mkdtemp()
-    output_dir = tempfile.mkdtemp()
-    input = BatchInput(input_dir, "geniusrise-test", "test-🤗-input")
-    output = BatchOutput(output_dir, "geniusrise-test", "test-🤗-output")
-    state = InMemoryState()
-    klass = HuggingFaceSentimentAnalysisFineTuner(
-        input=input,
-        output=output,
-        state=state,
-    )
-    klass.model_name = "bert-base-uncased"
-    klass.tokenizer_name = "bert-base-uncased"
-    klass.model_class = "BertForSequenceClassification"
-    klass.tokenizer_class = "BertTokenizer"
-    return klass
-
-
-def test_sentiment_bolt_init(sentiment_bolt):
-    sentiment_bolt.load_models()
-    assert sentiment_bolt.model is not None
-    assert sentiment_bolt.tokenizer is not None
-    assert sentiment_bolt.input is not None
-    assert sentiment_bolt.output is not None
-    assert sentiment_bolt.state is not None
-
-
-def test_load_dataset_all_formats(sentiment_bolt, dataset_file):
-    tmpdir, ext = dataset_file
-    dataset_path = os.path.join(tmpdir, "train")
-    sentiment_bolt.load_models()
-    dataset = sentiment_bolt.load_dataset(dataset_path)
-    assert dataset is not None
-    assert len(dataset) == 10
-
-
-def test_sentiment_bolt_fine_tune(sentiment_bolt, dataset_file):
-    tmpdir, ext = dataset_file
-    sentiment_bolt.input.input_folder = tmpdir
-
-    sentiment_bolt.fine_tune(
-        model_name="bert-base-uncased",
-        tokenizer_name="bert-base-uncased",
-        num_train_epochs=1,
-        per_device_train_batch_size=1,
-        model_class="BertForSequenceClassification",
-        tokenizer_class="BertTokenizer",
-        evaluate=True,
-    )
-
-    output_dir = sentiment_bolt.output.output_folder
-    assert os.path.isfile(os.path.join(output_dir + "/model", "pytorch_model.bin"))
-    assert os.path.isfile(os.path.join(output_dir + "/model", "config.json"))
-    assert os.path.isfile(os.path.join(output_dir + "/model", "training_args.bin"))
-
-
-def test_sentiment_bolt_compute_metrics(sentiment_bolt):
-    sentiment_bolt.load_models()
-
-    logits = np.array([[0.6, 0.4], [0.4, 0.6]])
-    labels = np.array([0, 1])
-    eval_pred = EvalPrediction(predictions=logits, label_ids=labels)
-    metrics = sentiment_bolt.compute_metrics(eval_pred)
-    assert "accuracy" in metrics
-    assert "precision" in metrics
-    assert "recall" in metrics
-    assert "f1" in metrics