Skip to content

Commit

Permalink
Feature/dataset generator (#1340)
Browse files Browse the repository at this point in the history
* base

* i am a banana

* fix example

* generate_dataset_from_docs

* wip

* wip

* wip

* wip

* a draft code for a RAG dataset generation

* generate_dataset_from_docs

* generate_dataset_from_docs

* generate_dataset_from_docs

* readable version

* wip

* wip

* wip

* generate_dataset_from_docs

* prompts

* generate_dataset_from_docs

* generate_dataset_from_docs

* fixes

* fixes

* generate_dataset_from_docs

* chunks count

* async

* generate_dataset_from_docs

* generate_dataset_from_docs

* generate_dataset_from_docs

* generate_dataset_from_docs

* generate_dataset_from_docs

* WIP

* fix

* rename

* fix import

* move system prompts into user

* generate_dataset_from_docs

* generate_dataset_from_docs

* generate_dataset_from_docs

* generate_dataset_from_docs

* prompt function signature

* function signature validation

* requirements

* requirements

* requirements

* requirements

* requirements

* requirements

* lil cleanup

* mypy

* move, add splitter

* fix example and deps

* rename

* lint

* lint

* audit

* type aliases and audit

* type aliases and audit and llm util refactor and stuff

* fix import

* pip audit

* pip audit

* reg

* pip audit

* remove

---------

Co-authored-by: Svetlana Popova <[email protected]>
Co-authored-by: Emeli Dral <[email protected]>
  • Loading branch information
3 people authored Oct 16, 2024
1 parent adc7c51 commit 8a05265
Show file tree
Hide file tree
Showing 22 changed files with 1,204 additions and 5 deletions.
11 changes: 7 additions & 4 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,12 @@ jobs:
- name: Install minimal dependencies
run: pip install -r requirements.min.txt
- name: Install package
run: pip install -e .[dev,spark,fsspec]
run: pip install -e .[dev,spark,fsspec,llm]
- name: Run pip-audit
run: pip-audit --ignore-vuln PYSEC-2024-48 --ignore-vuln GHSA-jw8x-6495-233v --ignore-vuln GHSA-4hq2-rpgc-r8r7
run: |
pip-audit \
--ignore-vuln GHSA-jw8x-6495-233v \
--ignore-vuln PYSEC-2024-38
- name: Run Tests
run: python -m pytest --durations=50
test:
Expand Down Expand Up @@ -155,7 +158,7 @@ jobs:
uses: ./.github/share-actions/get-bikes-dataset-cached

- name: Install package
run: pip install -e .[dev,spark,fsspec]
run: pip install -e .[dev,spark,fsspec,llm]
- name: Run Tests
run: python -m pytest --durations=50

Expand All @@ -173,7 +176,7 @@ jobs:
cache: "pip"
cache-dependency-path: setup.py
- name: Install dependencies
run: pip install -e ".[dev]"
run: pip install -e .
- name: Install wheel
run: pip install wheel
- name: Build package
Expand Down
66 changes: 66 additions & 0 deletions examples/data_generators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from evidently.experimental.dataset_generators.llm.questions import QADatasetFromSeedGenerator, QADatasetGenerator
from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
from evidently.options.base import Options


def generate_from_file():
    """Generate questions from a PDF file and print every generated row.

    The file is loaded through ``DataCollectionProvider.from_files`` using the
    "simple" splitter, and five questions are requested from the
    "gpt-4o-mini" model of the "openai" provider.
    """
    source_path = "../cloud_quickstart_tracing.pdf"
    collection = DataCollectionProvider.from_files(
        source_path,
        chunk_size=50,
        chunk_overlap=20,
        splitter="simple",
    )

    qa_generator = QADatasetGenerator(
        data_collection=collection,
        provider="openai",
        model="gpt-4o-mini",
        num_questions=5,
        options=Options.from_any_options(None),
    )

    # The question is always printed; answer and context only when the row
    # actually carries those fields.
    optional_fields = (("A", "answers"), ("C", "context"))
    for _, row in qa_generator.generate().iterrows():
        print("Q", row["questions"])
        for label, column in optional_fields:
            if column in row:
                print(label, row[column])
        print()


def _print_generated(generated):
    """Print each row of a generated dataset, followed by a blank line.

    The "questions" value is always printed (prefixed with "Q"); "answers"
    ("A") and "context" ("C") are printed only when the row has them.
    """
    for _, row in generated.iterrows():
        print("Q", row["questions"])
        if "answers" in row:
            print("A", row["answers"])
        if "context" in row:
            print("C", row["context"])
        print()


def main():
    """Run two generation examples and print their results.

    First generates questions from two in-memory text chunks with
    ``QADatasetGenerator``, then generates questions from a seed question
    with ``QADatasetFromSeedGenerator``. Both use the "openai" provider with
    the "gpt-4o-mini" model and request five questions.
    """
    data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
    generator = QADatasetGenerator(
        data_collection=data,
        provider="openai",
        model="gpt-4o-mini",
        num_questions=5,
        options=Options.from_any_options(None),
    )
    _print_generated(generator.generate())

    generator = QADatasetFromSeedGenerator(
        seed_question="What is 'kek'?",
        num_questions=5,
        provider="openai",
        model="gpt-4o-mini",
        options=Options.from_any_options(None),
    )
    _print_generated(generator.generate())


# Run the in-memory chunks example when executed as a script; the file-based
# example is left disabled and can be swapped in by uncommenting it.
if __name__ == "__main__":
    main()
    # generate_from_file()
1 change: 1 addition & 0 deletions requirements.min.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@ openai==1.16.2
evaluate==0.4.1
transformers[torch]==4.39.3
sentence-transformers==2.7.0
chromadb==0.4.0
9 changes: 9 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,15 @@ ignore_missing_imports = True
[mypy-litellm.*]
ignore_missing_imports = True

[mypy-chromadb.*]
ignore_missing_imports = True

[mypy-llama_index.*]
ignore_missing_imports = True

[mypy-pypdf.*]
ignore_missing_imports = True

[tool:pytest]
testpaths=tests
python_classes=*Test
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@
"types-python-dateutil==2.8.19",
"types-ujson>=5.4.0",
"pillow==10.3.0",
"httpx==0.24.1",
"httpx==0.27.0",
"ruff==0.3.7",
"pre-commit==3.5.0",
"pytest-asyncio==0.23.7",
Expand All @@ -102,6 +102,7 @@
"evaluate>=0.4.1",
"transformers[torch]>=4.39.3",
"sentence-transformers>=2.7.0",
"chromadb>=0.4.0",
],
"spark": ["pyspark>=3.4.0"],
"fsspec": [
Expand Down
3 changes: 3 additions & 0 deletions src/evidently/experimental/dataset_generators/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from . import _registry

__all__ = ["_registry"]
67 changes: 67 additions & 0 deletions src/evidently/experimental/dataset_generators/_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from evidently.experimental.dataset_generators.base import BaseDatasetGenerator
from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
from evidently.experimental.dataset_generators.llm.splitter import Splitter
from evidently.pydantic_utils import register_type_alias
from evidently.utils.llm.prompts import PromptTemplate

# Alias registrations for the dataset generator package, one row per
# concrete class: (base type, dotted path of the class, alias string).
# Rows are registered in the order listed.
# NOTE(review): the data collection provider aliases are spelled
# "data_collecton_provider" (sic). They are runtime identifiers and are
# reproduced exactly as they were; confirm before ever renaming them.
_TYPE_ALIASES = (
    (
        BaseDatasetGenerator,
        "evidently.experimental.dataset_generators.llm.questions.QADatasetFromSeedGenerator",
        "evidently:dataset_generator:QADatasetFromSeedGenerator",
    ),
    (
        BaseDatasetGenerator,
        "evidently.experimental.dataset_generators.llm.questions.QADatasetGenerator",
        "evidently:dataset_generator:QADatasetGenerator",
    ),
    (
        DataCollectionProvider,
        "evidently.experimental.dataset_generators.llm.index.ChunksDataCollectionProvider",
        "evidently:data_collecton_provider:ChunksDataCollectionProvider",
    ),
    (
        DataCollectionProvider,
        "evidently.experimental.dataset_generators.llm.index.FileDataCollectionProvider",
        "evidently:data_collecton_provider:FileDataCollectionProvider",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.BaselineAnswerPromptTemplate",
        "evidently:prompt_template:BaselineAnswerPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.NaiveQuestionsFromContextPromptTemplate",
        "evidently:prompt_template:NaiveQuestionsFromContextPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.QuestionsFromContextPromptTemplate",
        "evidently:prompt_template:QuestionsFromContextPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.QuestionsFromSeedPromptTemplate",
        "evidently:prompt_template:QuestionsFromSeedPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.ReformulateQuestionPromptTemplate",
        "evidently:prompt_template:ReformulateQuestionPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.SimpleQuestionPromptTemplate",
        "evidently:prompt_template:SimpleQuestionPromptTemplate",
    ),
    (
        Splitter,
        "evidently.experimental.dataset_generators.llm.splitter.LlamaIndexSplitter",
        "evidently:splitter:LlamaIndexSplitter",
    ),
    (
        Splitter,
        "evidently.experimental.dataset_generators.llm.splitter.SimpleSplitter",
        "evidently:splitter:SimpleSplitter",
    ),
)

for _base_type, _classpath, _alias in _TYPE_ALIASES:
    register_type_alias(_base_type, _classpath, _alias)
21 changes: 21 additions & 0 deletions src/evidently/experimental/dataset_generators/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from abc import ABC
from abc import abstractmethod

import pandas as pd
from typing_extensions import TypeAlias

from evidently.options.base import Options
from evidently.pydantic_utils import EvidentlyBaseModel

DatasetGeneratorResult: TypeAlias = pd.DataFrame


class BaseDatasetGenerator(EvidentlyBaseModel, ABC):
    """Abstract base model for dataset generators.

    Subclasses implement :meth:`generate`, which returns a
    ``DatasetGeneratorResult`` (an alias of ``pandas.DataFrame``).
    """

    class Config:
        # Flags this model as a base type for EvidentlyBaseModel's machinery.
        is_base_type = True

    # Options instance carried by every generator.
    options: Options

    @abstractmethod
    def generate(self) -> DatasetGeneratorResult:
        """Produce the generated dataset; must be overridden by subclasses."""
        raise NotImplementedError
Empty file.
22 changes: 22 additions & 0 deletions src/evidently/experimental/dataset_generators/llm/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from typing import Optional

from evidently._pydantic_compat import PrivateAttr
from evidently.experimental.dataset_generators.base import BaseDatasetGenerator
from evidently.options.base import Options
from evidently.utils.llm.wrapper import LLMWrapper
from evidently.utils.llm.wrapper import get_llm_wrapper


class BaseLLMDatasetGenerator(BaseDatasetGenerator):
    """Dataset generator base that lazily builds and caches an LLM wrapper.

    The wrapper is created from ``provider`` and ``model`` on first use and
    reused on every later call.
    """

    provider: str
    model: str
    # Cached wrapper; stays None until the first get_llm_wrapper() call.
    _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)

    def get_llm_wrapper(self, options: Options) -> LLMWrapper:
        """Return the cached wrapper, creating it with ``options`` if absent.

        Once a wrapper exists it is returned as-is, so ``options`` passed on
        later calls are not used.
        """
        if self._llm_wrapper is not None:
            return self._llm_wrapper
        self._llm_wrapper = get_llm_wrapper(self.provider, self.model, options)
        return self._llm_wrapper

    @property
    def wrapper(self) -> LLMWrapper:
        """LLM wrapper built from this generator's own ``options``."""
        return self.get_llm_wrapper(self.options)
Loading

0 comments on commit 8a05265

Please sign in to comment.