From 8510dc950e9028c6c5858efdf6fc953145983263 Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Fri, 27 Oct 2023 18:38:58 +0530 Subject: [PATCH] refactor variable names --- geniusrise_huggingface/__init__.py | 1 - geniusrise_huggingface/base/api.py | 22 +- geniusrise_huggingface/base/bulk.py | 12 +- geniusrise_huggingface/base/tests/test_api.py | 18 +- .../base/tests/test_bulk.py | 18 +- geniusrise_huggingface/embeddings/api.py | 24 +- geniusrise_huggingface/embeddings/bulk.py | 24 +- geniusrise_huggingface/sentiment_analysis.py | 258 ------------------ tests/test_sentiment_analysis.py | 168 ------------ 9 files changed, 59 insertions(+), 486 deletions(-) delete mode 100644 geniusrise_huggingface/sentiment_analysis.py delete mode 100644 tests/test_sentiment_analysis.py diff --git a/geniusrise_huggingface/__init__.py b/geniusrise_huggingface/__init__.py index 1b3d7e7..fa99ea9 100644 --- a/geniusrise_huggingface/__init__.py +++ b/geniusrise_huggingface/__init__.py @@ -20,6 +20,5 @@ from geniusrise_huggingface.language_model import HuggingFaceLanguageModelingFineTuner from geniusrise_huggingface.ner import HuggingFaceNamedEntityRecognitionFineTuner from geniusrise_huggingface.question_answering import HuggingFaceQuestionAnsweringFineTuner -from geniusrise_huggingface.sentiment_analysis import HuggingFaceSentimentAnalysisFineTuner from geniusrise_huggingface.summarization import HuggingFaceSummarizationFineTuner from geniusrise_huggingface.translation import HuggingFaceTranslationFineTuner diff --git a/geniusrise_huggingface/base/api.py b/geniusrise_huggingface/base/api.py index a9c874c..9568c40 100644 --- a/geniusrise_huggingface/base/api.py +++ b/geniusrise_huggingface/base/api.py @@ -32,8 +32,8 @@ class HuggingFaceAPI(HuggingFaceBulk): model_revision (Optional[str]): The revision of the pre-trained language model. tokenizer_name (str): The name of the tokenizer used to preprocess input text. tokenizer_revision (Optional[str]): The revision of the tokenizer used to preprocess input text. - model_class_name (str): The name of the class of the pre-trained language model. - tokenizer_class_name (str): The name of the class of the tokenizer used to preprocess input text. + model_class (str): The name of the class of the pre-trained language model. + tokenizer_class (str): The name of the class of the tokenizer used to preprocess input text. use_cuda (bool): Whether to use a GPU for inference. quantization (int): The level of quantization to use for the pre-trained language model. precision (str): The precision to use for the pre-trained language model. @@ -46,7 +46,7 @@ class HuggingFaceAPI(HuggingFaceBulk): text(**kwargs: Any) -> Dict[str, Any]: Generates text based on the given prompt and decoding strategy. - listen(model_name: str, model_class_name: str = "AutoModelForCausalLM", tokenizer_class_name: str = "AutoTokenizer", use_cuda: bool = False, precision: str = "float16", quantization: int = 0, device_map: str | Dict | None = "auto", max_memory={0: "24GB"}, torchscript: bool = True, endpoint: str = "*", port: int = 3000, cors_domain: str = "http://localhost:3000", username: Optional[str] = None, password: Optional[str] = None, **model_args: Any) -> None: + listen(model_name: str, model_class: str = "AutoModelForCausalLM", tokenizer_class: str = "AutoTokenizer", use_cuda: bool = False, precision: str = "float16", quantization: int = 0, device_map: str | Dict | None = "auto", max_memory={0: "24GB"}, torchscript: bool = True, endpoint: str = "*", port: int = 3000, cors_domain: str = "http://localhost:3000", username: Optional[str] = None, password: Optional[str] = None, **model_args: Any) -> None: Starts a CherryPy server to listen for requests to generate text. """ @@ -118,8 +118,8 @@ def text(self, **kwargs: Any) -> Dict[str, Any]: def listen( self, model_name: str, - model_class_name: str = "AutoModelForCausalLM", - tokenizer_class_name: str = "AutoTokenizer", + model_class: str = "AutoModelForCausalLM", + tokenizer_class: str = "AutoTokenizer", use_cuda: bool = False, precision: str = "float16", quantization: int = 0, @@ -138,8 +138,8 @@ def listen( Args: model_name (str): The name of the pre-trained language model. - model_class_name (str, optional): The name of the class of the pre-trained language model. Defaults to "AutoModelForCausalLM". - tokenizer_class_name (str, optional): The name of the class of the tokenizer used to preprocess input text. Defaults to "AutoTokenizer". + model_class (str, optional): The name of the class of the pre-trained language model. Defaults to "AutoModelForCausalLM". + tokenizer_class (str, optional): The name of the class of the tokenizer used to preprocess input text. Defaults to "AutoTokenizer". use_cuda (bool, optional): Whether to use a GPU for inference. Defaults to False. precision (str, optional): The precision to use for the pre-trained language model. Defaults to "float16". quantization (int, optional): The level of quantization to use for the pre-trained language model. Defaults to 0. @@ -154,8 +154,8 @@ def listen( **model_args (Any): Additional arguments to pass to the pre-trained language model. """ self.model_name = model_name - self.model_class_name = model_class_name - self.tokenizer_class_name = tokenizer_class_name + self.model_class = model_class + self.tokenizer_class = tokenizer_class self.use_cuda = use_cuda self.quantization = quantization self.precision = precision @@ -182,8 +182,8 @@ def listen( tokenizer_name=self.tokenizer_name, model_revision=self.model_revision, tokenizer_revision=self.tokenizer_revision, - model_class_name=self.model_class_name, - tokenizer_class_name=self.tokenizer_class_name, + model_class=self.model_class, + tokenizer_class=self.tokenizer_class, use_cuda=self.use_cuda, precision=self.precision, quantization=self.quantization, diff --git a/geniusrise_huggingface/base/bulk.py b/geniusrise_huggingface/base/bulk.py index e660a4a..50b4509 100644 --- a/geniusrise_huggingface/base/bulk.py +++ b/geniusrise_huggingface/base/bulk.py @@ -205,8 +205,8 @@ def load_models( tokenizer_name: str, model_revision: Optional[str] = None, tokenizer_revision: Optional[str] = None, - model_class_name: str = "AutoModelForCausalLM", - tokenizer_class_name: str = "AutoTokenizer", + model_class: str = "AutoModelForCausalLM", + tokenizer_class: str = "AutoTokenizer", use_cuda: bool = False, precision: str = "float16", quantization: int = 0, @@ -220,8 +220,8 @@ def load_models( Parameters: - model_name (str): The name of the model to load. - - model_class_name (str): The class name of the model to load. Default is "AutoModelForCausalLM". - - tokenizer_class_name (str): The class name of the tokenizer to load. Default is "AutoTokenizer". + - model_class (str): The class name of the model to load. Default is "AutoModelForCausalLM". + - tokenizer_class (str): The class name of the tokenizer to load. Default is "AutoTokenizer". - use_cuda (bool): Whether to use CUDA for GPU acceleration. Default is False. - precision (str): The bit precision for model and tokenizer. Options are 'float32', 'float16', 'bfloat16'. Default is 'float16'. - device_map (Union[str, Dict]): Device map for model placement. Default is "auto". @@ -251,8 +251,8 @@ def load_models( if use_cuda and not device_map: device_map = "auto" - ModelClass = getattr(transformers, model_class_name) - TokenizerClass = getattr(transformers, tokenizer_class_name) + ModelClass = getattr(transformers, model_class) + TokenizerClass = getattr(transformers, tokenizer_class) # Load the model and tokenizer tokenizer = TokenizerClass.from_pretrained(tokenizer_name, revision=tokenizer_revision, torch_dtype=torch_dtype) diff --git a/geniusrise_huggingface/base/tests/test_api.py b/geniusrise_huggingface/base/tests/test_api.py index 303e818..74ab46d 100644 --- a/geniusrise_huggingface/base/tests/test_api.py +++ b/geniusrise_huggingface/base/tests/test_api.py @@ -25,7 +25,7 @@ @pytest.fixture( params=[ - # model_name, model_class_name, tokenizer_class_name, use_cuda, precision, quantization, device_map, max_memory, torchscript + # model_name, model_class, tokenizer_class, use_cuda, precision, quantization, device_map, max_memory, torchscript # fmt: off ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", 0, None, None, False), ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", False, "float32", 0, None, None, False), @@ -72,8 +72,8 @@ def hfa(): def test_load_models(hfa, model_config): ( model_name, - model_class_name, - tokenizer_class_name, + model_class, + tokenizer_class, use_cuda, precision, quantization, @@ -97,8 +97,8 @@ def test_load_models(hfa, model_config): model_revision=model_revision, tokenizer_name=model_name, tokenizer_revision=tokenizer_revision, - model_class_name=model_class_name, - tokenizer_class_name=tokenizer_class_name, + model_class=model_class, + tokenizer_class=tokenizer_class, use_cuda=use_cuda, precision=precision, quantization=quantization, @@ -149,8 +149,8 @@ def test_load_models(hfa, model_config): def test_generate_strategies(hfa, model_config, strategy): ( model_name, - model_class_name, - tokenizer_class_name, + model_class, + tokenizer_class, use_cuda, precision, quantization, @@ -174,8 +174,8 @@ def test_generate_strategies(hfa, model_config, strategy): model_revision=model_revision, tokenizer_name=model_name, tokenizer_revision=tokenizer_revision, - model_class_name=model_class_name, - tokenizer_class_name=tokenizer_class_name, + model_class=model_class, + tokenizer_class=tokenizer_class, use_cuda=use_cuda, precision=precision, quantization=quantization, diff --git a/geniusrise_huggingface/base/tests/test_bulk.py b/geniusrise_huggingface/base/tests/test_bulk.py index 198e009..88f979f 100644 --- a/geniusrise_huggingface/base/tests/test_bulk.py +++ b/geniusrise_huggingface/base/tests/test_bulk.py @@ -25,7 +25,7 @@ @pytest.fixture( params=[ - # model_name, model_class_name, tokenizer_class_name, use_cuda, precision, quantization, device_map, max_memory, torchscript + # model_name, model_class, tokenizer_class, use_cuda, precision, quantization, device_map, max_memory, torchscript # fmt: off ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", 0, None, None, False), ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", False, "float32", 0, None, None, False), @@ -72,8 +72,8 @@ def hfa(): def test_load_models(hfa, model_config): ( model_name, - model_class_name, - tokenizer_class_name, + model_class, + tokenizer_class, use_cuda, precision, quantization, @@ -97,8 +97,8 @@ def test_load_models(hfa, model_config): model_revision=model_revision, tokenizer_name=model_name, tokenizer_revision=tokenizer_revision, - model_class_name=model_class_name, - tokenizer_class_name=tokenizer_class_name, + model_class=model_class, + tokenizer_class=tokenizer_class, use_cuda=use_cuda, precision=precision, quantization=quantization, @@ -149,8 +149,8 @@ def test_load_models(hfa, model_config): def test_generate_strategies(hfa, model_config, strategy): ( model_name, - model_class_name, - tokenizer_class_name, + model_class, + tokenizer_class, use_cuda, precision, quantization, @@ -174,8 +174,8 @@ def test_generate_strategies(hfa, model_config, strategy): model_revision=model_revision, tokenizer_name=model_name, tokenizer_revision=tokenizer_revision, - model_class_name=model_class_name, - tokenizer_class_name=tokenizer_class_name, + model_class=model_class, + tokenizer_class=tokenizer_class, use_cuda=use_cuda, precision=precision, quantization=quantization, diff --git a/geniusrise_huggingface/embeddings/api.py b/geniusrise_huggingface/embeddings/api.py index 90e758b..3439d4c 100644 --- a/geniusrise_huggingface/embeddings/api.py +++ b/geniusrise_huggingface/embeddings/api.py @@ -45,8 +45,8 @@ class EmbeddingsAPI(HuggingFaceAPI): genius EmbeddingsAPI rise \ listen \ --model_name=bert-base-uncased \ - --model_class_name=AutoModelForCausalLM \ - --tokenizer_class_name=AutoTokenizer \ + --model_class=AutoModelForCausalLM \ + --tokenizer_class=AutoTokenizer \ --sentence_transformer_model=paraphrase-MiniLM-L6-v2 \ --use_cuda=True \ --precision=float16 \ @@ -68,8 +68,8 @@ class EmbeddingsAPI(HuggingFaceAPI): method: "listen" args: model_name: "bert-base-uncased" - model_class_name: "AutoModelForCausalLM" - tokenizer_class_name: "AutoTokenizer" + model_class: "AutoModelForCausalLM" + tokenizer_class: "AutoTokenizer" sentence_transformer_model: "paraphrase-MiniLM-L6-v2" use_cuda: True precision: "float16" @@ -234,8 +234,8 @@ def sentence_permutations(self, **kwargs: Any) -> Dict[str, Any]: def listen( # type: ignore self, model_name: str, - model_class_name: str = "AutoModelForCausalLM", - tokenizer_class_name: str = "AutoTokenizer", + model_class: str = "AutoModelForCausalLM", + tokenizer_class: str = "AutoTokenizer", use_cuda: bool = False, precision: str = "float16", quantization: int = 0, @@ -254,8 +254,8 @@ def listen( # type: ignore Parameters: - model_name (str): The name of the Hugging Face model to use. - - model_class_name (str, optional): The class name of the model. Defaults to "AutoModelForCausalLM". - - tokenizer_class_name (str, optional): The class name of the tokenizer. Defaults to "AutoTokenizer". + - model_class (str, optional): The class name of the model. Defaults to "AutoModelForCausalLM". + - tokenizer_class (str, optional): The class name of the tokenizer. Defaults to "AutoTokenizer". - sentence_transformer_model (str, optional): The name of the Sentence Transformer model to use. Defaults to "paraphrase-MiniLM-L6-v2". - use_cuda (bool, optional): Whether to use CUDA for computation. Defaults to False. - precision (str, optional): The precision to use for computations. Defaults to "float16". @@ -273,8 +273,8 @@ def listen( # type: ignore None """ self.model_name = model_name - self.model_class_name = model_class_name - self.tokenizer_class_name = tokenizer_class_name + self.model_class = model_class + self.tokenizer_class = tokenizer_class self.use_cuda = use_cuda self.quantization = quantization self.precision = precision @@ -302,8 +302,8 @@ def listen( # type: ignore tokenizer_name=self.tokenizer_name, model_revision=self.model_revision, tokenizer_revision=self.tokenizer_revision, - model_class_name=self.model_class_name, - tokenizer_class_name=self.tokenizer_class_name, + model_class=self.model_class, + tokenizer_class=self.tokenizer_class, use_cuda=self.use_cuda, precision=self.precision, quantization=self.quantization, diff --git a/geniusrise_huggingface/embeddings/bulk.py b/geniusrise_huggingface/embeddings/bulk.py index 343d07e..afde639 100644 --- a/geniusrise_huggingface/embeddings/bulk.py +++ b/geniusrise_huggingface/embeddings/bulk.py @@ -102,8 +102,8 @@ def load_models( tokenizer_name: str, model_revision: Optional[str] = None, tokenizer_revision: Optional[str] = None, - model_class_name: str = "AutoModelForCausalLM", - tokenizer_class_name: str = "AutoTokenizer", + model_class: str = "AutoModelForCausalLM", + tokenizer_class: str = "AutoTokenizer", use_cuda: bool = False, precision: str = "float16", quantization: int = 0, @@ -117,8 +117,8 @@ def load_models( Parameters: - model_name (str): The name of the model to load. - - model_class_name (str): The class name of the model to load. Default is "AutoModelForCausalLM". - - tokenizer_class_name (str): The class name of the tokenizer to load. Default is "AutoTokenizer". + - model_class (str): The class name of the model to load. Default is "AutoModelForCausalLM". + - tokenizer_class (str): The class name of the tokenizer to load. Default is "AutoTokenizer". - use_cuda (bool): Whether to use CUDA for GPU acceleration. Default is False. - precision (str): The bit precision for model and tokenizer. Options are 'float32', 'float16', 'bfloat16'. Default is 'float16'. - device_map (Union[str, Dict]): Device map for model placement. Default is "auto". @@ -148,8 +148,8 @@ def load_models( if use_cuda and not device_map: device_map = "auto" - ModelClass = getattr(transformers, model_class_name) - TokenizerClass = getattr(transformers, tokenizer_class_name) + ModelClass = getattr(transformers, model_class) + TokenizerClass = getattr(transformers, tokenizer_class) # Load the model and tokenizer tokenizer = TokenizerClass.from_pretrained(tokenizer_name, revision=tokenizer_revision, torch_dtype=torch_dtype) @@ -199,8 +199,8 @@ def generate( self, kind: str, model_name: str, - model_class_name: str = "AutoModelForCausalLM", - tokenizer_class_name: str = "AutoTokenizer", + model_class: str = "AutoModelForCausalLM", + tokenizer_class: str = "AutoTokenizer", sentence_transformer_model: str = "paraphrase-MiniLM-L6-v2", use_cuda: bool = False, precision: str = "float16", @@ -220,8 +220,8 @@ def generate( This method reads text data from the specified input path, generates embeddings, and saves them to the specified output path. """ self.model_name = model_name - self.model_class_name = model_class_name - self.tokenizer_class_name = tokenizer_class_name + self.model_class = model_class + self.tokenizer_class = tokenizer_class self.use_cuda = use_cuda self.quantization = quantization self.precision = precision @@ -255,8 +255,8 @@ def generate( tokenizer_name=self.tokenizer_name, model_revision=self.model_revision, tokenizer_revision=self.tokenizer_revision, - model_class_name=self.model_class_name, - tokenizer_class_name=self.tokenizer_class_name, + model_class=self.model_class, + tokenizer_class=self.tokenizer_class, use_cuda=self.use_cuda, precision=self.precision, quantization=self.quantization, diff --git a/geniusrise_huggingface/sentiment_analysis.py b/geniusrise_huggingface/sentiment_analysis.py deleted file mode 100644 index 63af85e..0000000 --- a/geniusrise_huggingface/sentiment_analysis.py +++ /dev/null @@ -1,258 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import sqlite3 -import xml.etree.ElementTree as ET -from typing import Any, Dict, List, Union - -import pandas as pd -import torch -import yaml # type: ignore -from datasets import Dataset, DatasetDict, load_from_disk -from pyarrow import feather -from pyarrow import parquet as pq -from transformers import DataCollatorWithPadding - -from geniusrise_huggingface.base import HuggingFaceFineTuner - - -class HuggingFaceSentimentAnalysisFineTuner(HuggingFaceFineTuner): - r""" - A bolt for fine-tuning Hugging Face models on sentiment analysis tasks. - - Args: - input (BatchInput): The batch input data. - output (OutputConfig): The output data. - state (State): The state manager. - - CLI Usage: - - ```bash - genius HuggingFaceSentimentAnalysisFineTuner rise \ - batch \ - --input_s3_bucket geniusrise-test \ - --input_s3_folder train \ - batch \ - --output_s3_bucket geniusrise-test \ - --output_s3_folder model \ - fine_tune \ - --args model_name=my_model tokenizer_name=my_tokenizer num_train_epochs=3 per_device_train_batch_size=8 - ``` - - YAML Configuration: - - ```yaml - version: "1" - bolts: - my_fine_tuner: - name: "HuggingFaceSentimentAnalysisFineTuner" - method: "fine_tune" - args: - model_name: "my_model" - tokenizer_name: "my_tokenizer" - num_train_epochs: 3 - per_device_train_batch_size: 8 - data_max_length: 512 - input: - type: "batch" - args: - bucket: "my_bucket" - folder: "my_dataset" - output: - type: "batch" - args: - bucket: "my_bucket" - folder: "my_model" - deploy: - type: k8s - args: - kind: deployment - name: my_fine_tuner - context_name: arn:aws:eks:us-east-1:genius-dev:cluster/geniusrise-dev - namespace: geniusrise - image: geniusrise/geniusrise - kube_config_path: ~/.kube/config - ``` - - Supported Data Formats: - - JSONL - - CSV - - Parquet - - JSON - - XML - - YAML - - TSV - - Excel (.xls, .xlsx) - - SQLite (.db) - - Feather - """ - - def load_dataset(self, dataset_path: str, **kwargs: Any) -> Dataset | DatasetDict: - r""" - Load a dataset from a directory. - - Args: - dataset_path (str): The path to the dataset directory. - **kwargs: Additional keyword arguments. - - Returns: - Dataset | DatasetDict: The loaded dataset. - - ## Supported Data Formats and Structures: - - ### JSONL - Each line is a JSON object representing an example. - ```json - {"text": "The text content", "label": "The label"} - ``` - - ### CSV - Should contain 'text' and 'label' columns. - ```csv - text,label - "The text content","The label" - ``` - - ### Parquet - Should contain 'text' and 'label' columns. - - ### JSON - An array of dictionaries with 'text' and 'label' keys. - ```json - [{"text": "The text content", "label": "The label"}] - ``` - - ### XML - Each 'record' element should contain 'text' and 'label' child elements. - ```xml - - The text content - - - ``` - - ### YAML - Each document should be a dictionary with 'text' and 'label' keys. - ```yaml - - text: "The text content" - label: "The label" - ``` - - ### TSV - Should contain 'text' and 'label' columns separated by tabs. - - ### Excel (.xls, .xlsx) - Should contain 'text' and 'label' columns. - - ### SQLite (.db) - Should contain a table with 'text' and 'label' columns. - - ### Feather - Should contain 'text' and 'label' columns. - """ - if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): - dataset = load_from_disk(dataset_path) - else: - data = [] - for filename in os.listdir(dataset_path): - filepath = os.path.join(dataset_path, filename) - if filename.endswith(".jsonl"): - with open(filepath, "r") as f: - for line in f: - example = json.loads(line) - data.append(example) - elif filename.endswith(".csv"): - df = pd.read_csv(filepath) - data.extend(df.to_dict("records")) - elif filename.endswith(".parquet"): - df = pq.read_table(filepath).to_pandas() - data.extend(df.to_dict("records")) - elif filename.endswith(".json"): - with open(filepath, "r") as f: - json_data = json.load(f) - data.extend(json_data) - elif filename.endswith(".xml"): - tree = ET.parse(filepath) - root = tree.getroot() - for record in root.findall("record"): - text = record.find("text").text # type: ignore - label = record.find("label").text # type: ignore - data.append({"text": text, "label": label}) - elif filename.endswith(".yaml") or filename.endswith(".yml"): - with open(filepath, "r") as f: - yaml_data = yaml.safe_load(f) - data.extend(yaml_data) - elif filename.endswith(".tsv"): - df = pd.read_csv(filepath, sep="\t") - data.extend(df.to_dict("records")) - elif filename.endswith((".xls", ".xlsx")): - df = pd.read_excel(filepath) - data.extend(df.to_dict("records")) - elif filename.endswith(".db"): - conn = sqlite3.connect(filepath) - query = "SELECT text, label FROM dataset_table;" - df = pd.read_sql_query(query, conn) - data.extend(df.to_dict("records")) - elif filename.endswith(".feather"): - df = feather.read_feather(filepath) - data.extend(df.to_dict("records")) - - if self.data_extractor_lambda: - fn = eval(self.data_extractor_lambda) - data = [fn(d) for d in data] - else: - data = data - - dataset = Dataset.from_pandas(pd.DataFrame(data)) - - tokenized_dataset = dataset.map( - self.prepare_train_features, - batched=True, - remove_columns=dataset.column_names, - ) - return tokenized_dataset - - def prepare_train_features(self, examples: Dict[str, Union[str, int]]) -> Dict[str, Union[List[int], int]]: - """ - Tokenize the examples and prepare the features for training. - - Args: - examples (Dict[str, Union[str, int]]): A dictionary of examples. - - Returns: - Dict[str, Union[List[int], int]]: The processed features. - """ - if not self.tokenizer: - raise Exception("No tokenizer found, please call load_models first.") - - tokenized_inputs = self.tokenizer(examples["text"], truncation=True, padding=False) - tokenized_inputs["labels"] = examples["label"] - return tokenized_inputs - - def data_collator( - self, examples: List[Dict[str, Union[List[int], int]]] - ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: - """ - Customize the data collator. - - Args: - examples (List[Dict[str, Union[List[int], int]]]): The examples to collate. - - Returns: - Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: The collated data. - """ - return DataCollatorWithPadding(self.tokenizer)(examples) diff --git a/tests/test_sentiment_analysis.py b/tests/test_sentiment_analysis.py deleted file mode 100644 index 4308a1a..0000000 --- a/tests/test_sentiment_analysis.py +++ /dev/null @@ -1,168 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import sqlite3 -import tempfile -import xml.etree.ElementTree as ET - -import numpy as np -import pandas as pd -import pytest -import yaml # type: ignore -from datasets import Dataset -from geniusrise.core import BatchInput, BatchOutput, InMemoryState -from pyarrow import feather -from pyarrow import parquet as pq -from transformers import EvalPrediction - -from geniusrise_huggingface.sentiment_analysis import HuggingFaceSentimentAnalysisFineTuner - - -# Helper function to create synthetic data in different formats -def create_dataset_in_format(directory, ext): - os.makedirs(directory, exist_ok=True) - data = [{"text": f"text_{i}", "label": i % 2} for i in range(10)] - df = pd.DataFrame(data) - - if ext == "huggingface": - dataset = Dataset.from_pandas(df) - dataset.save_to_disk(directory) - elif ext == "csv": - df.to_csv(os.path.join(directory, "data.csv"), index=False) - elif ext == "jsonl": - with open(os.path.join(directory, "data.jsonl"), "w") as f: - for item in data: - f.write(json.dumps(item) + "\n") - elif ext == "parquet": - pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) - elif ext == "json": - with open(os.path.join(directory, "data.json"), "w") as f: - json.dump(data, f) - elif ext == "xml": - root = ET.Element("root") - for item in data: - record = ET.SubElement(root, "record") - ET.SubElement(record, "text").text = item["text"] - ET.SubElement(record, "label").text = str(item["label"]) - tree = ET.ElementTree(root) - tree.write(os.path.join(directory, "data.xml")) - elif ext == "yaml": - with open(os.path.join(directory, "data.yaml"), "w") as f: - yaml.dump(data, f) - elif ext == "tsv": - df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") - elif ext == "xlsx": - df.to_excel(os.path.join(directory, "data.xlsx"), index=False) - elif ext == "db": - conn = sqlite3.connect(os.path.join(directory, "data.db")) - df.to_sql("dataset_table", conn, if_exists="replace", index=False) - conn.close() - elif ext == "feather": - feather.write_feather(df, os.path.join(directory, "data.feather")) - - -# Fixtures for each file type -@pytest.fixture( - params=[ - "huggingface", - "csv", - "json", - "jsonl", - "parquet", - "xml", - "yaml", - "tsv", - "xlsx", - "db", - "feather", - ] -) -def dataset_file(request, tmpdir): - ext = request.param - create_dataset_in_format(tmpdir + "/train", ext) - create_dataset_in_format(tmpdir + "/eval", ext) - return tmpdir, ext - - -@pytest.fixture -def sentiment_bolt(): - input_dir = tempfile.mkdtemp() - output_dir = tempfile.mkdtemp() - input = BatchInput(input_dir, "geniusrise-test", "test-🤗-input") - output = BatchOutput(output_dir, "geniusrise-test", "test-🤗-output") - state = InMemoryState() - klass = HuggingFaceSentimentAnalysisFineTuner( - input=input, - output=output, - state=state, - ) - klass.model_name = "bert-base-uncased" - klass.tokenizer_name = "bert-base-uncased" - klass.model_class = "BertForSequenceClassification" - klass.tokenizer_class = "BertTokenizer" - return klass - - -def test_sentiment_bolt_init(sentiment_bolt): - sentiment_bolt.load_models() - assert sentiment_bolt.model is not None - assert sentiment_bolt.tokenizer is not None - assert sentiment_bolt.input is not None - assert sentiment_bolt.output is not None - assert sentiment_bolt.state is not None - - -def test_load_dataset_all_formats(sentiment_bolt, dataset_file): - tmpdir, ext = dataset_file - dataset_path = os.path.join(tmpdir, "train") - sentiment_bolt.load_models() - dataset = sentiment_bolt.load_dataset(dataset_path) - assert dataset is not None - assert len(dataset) == 10 - - -def test_sentiment_bolt_fine_tune(sentiment_bolt, dataset_file): - tmpdir, ext = dataset_file - sentiment_bolt.input.input_folder = tmpdir - - sentiment_bolt.fine_tune( - model_name="bert-base-uncased", - tokenizer_name="bert-base-uncased", - num_train_epochs=1, - per_device_train_batch_size=1, - model_class="BertForSequenceClassification", - tokenizer_class="BertTokenizer", - evaluate=True, - ) - - output_dir = sentiment_bolt.output.output_folder - assert os.path.isfile(os.path.join(output_dir + "/model", "pytorch_model.bin")) - assert os.path.isfile(os.path.join(output_dir + "/model", "config.json")) - assert os.path.isfile(os.path.join(output_dir + "/model", "training_args.bin")) - - -def test_sentiment_bolt_compute_metrics(sentiment_bolt): - sentiment_bolt.load_models() - - logits = np.array([[0.6, 0.4], [0.4, 0.6]]) - labels = np.array([0, 1]) - eval_pred = EvalPrediction(predictions=logits, label_ids=labels) - metrics = sentiment_bolt.compute_metrics(eval_pred) - assert "accuracy" in metrics - assert "precision" in metrics - assert "recall" in metrics - assert "f1" in metrics