-
Notifications
You must be signed in to change notification settings - Fork 629
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* base * i am a banana * fix example * generate_dataset_from_docs * wip * wip * wip * wip * a draft code for a RAG dataset generation * generate_dataset_from_docs * generate_dataset_from_docs * generate_dataset_from_docs * readable version * wip * wip * wip * generate_dataset_from_docs * prompts * generate_dataset_from_docs * generate_dataset_from_docs * fixes * fixes * generate_dataset_from_docs * chunks count * async * generate_dataset_from_docs * generate_dataset_from_docs * generate_dataset_from_docs * generate_dataset_from_docs * generate_dataset_from_docs * WIP * fix * rename * fix import * move system prompts into user * generate_dataset_from_docs * generate_dataset_from_docs * generate_dataset_from_docs * generate_dataset_from_docs * prompt function signature * function signature validation * requirements * requirements * requirements * requirements * requirements * requirements * lil cleanup * mypy * move, add splitter * fix example and deps * rename * lint * lint * audit * type aliases and audit * type aliases and audit and llm util refactor and stuff * fix import * pip audit * pip audit * reg * pip audit * remove --------- Co-authored-by: Svetlana Popova <[email protected]> Co-authored-by: Emeli Dral <[email protected]>
- Loading branch information
1 parent
adc7c51
commit 8a05265
Showing
22 changed files
with
1,204 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
from evidently.experimental.dataset_generators.llm.questions import QADatasetFromSeedGenerator, QADatasetGenerator | ||
from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider | ||
from evidently.options.base import Options | ||
|
||
|
||
def generate_from_file(file_path: str = "../cloud_quickstart_tracing.pdf") -> None:
    """Generate questions from a document on disk and print them.

    The file is loaded through ``DataCollectionProvider.from_files`` using the
    "simple" splitter (``chunk_size=50``, ``chunk_overlap=20``), and a
    ``QADatasetGenerator`` configured for the ``openai`` provider and the
    ``gpt-4o-mini`` model is asked for five questions.

    Args:
        file_path: Path of the document to read. Defaults to the PDF this
            example was originally written against.
    """
    data = DataCollectionProvider.from_files(file_path, chunk_size=50, chunk_overlap=20, splitter="simple")

    generator = QADatasetGenerator(
        data_collection=data,
        provider="openai",
        model="gpt-4o-mini",
        num_questions=5,
        options=Options.from_any_options(None),
    )
    generated = generator.generate()
    for _, row in generated.iterrows():
        print("Q", row["questions"])
        # "answers" and "context" are only printed when the row has them.
        if "answers" in row:
            print("A", row["answers"])
        if "context" in row:
            print("C", row["context"])
        print()
|
||
|
||
def _print_generated(generated) -> None:
    """Print every generated row as Q / A / C lines followed by a blank line."""
    for _, row in generated.iterrows():
        print("Q", row["questions"])
        # "answers" and "context" are only printed when the row has them.
        if "answers" in row:
            print("A", row["answers"])
        if "context" in row:
            print("C", row["context"])
        print()


def main() -> None:
    """Run the two in-memory examples and print what each one generates.

    First a ``QADatasetGenerator`` is built over two hard-coded text chunks,
    then a ``QADatasetFromSeedGenerator`` is built from a single seed
    question. Both are configured for the ``openai`` provider and the
    ``gpt-4o-mini`` model and asked for five questions.
    """
    data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
    chunk_generator = QADatasetGenerator(
        data_collection=data,
        provider="openai",
        model="gpt-4o-mini",
        num_questions=5,
        options=Options.from_any_options(None),
    )
    _print_generated(chunk_generator.generate())

    seed_generator = QADatasetFromSeedGenerator(
        seed_question="What is 'kek'?",
        num_questions=5,
        provider="openai",
        model="gpt-4o-mini",
        options=Options.from_any_options(None),
    )
    _print_generated(seed_generator.generate())
|
||
|
||
if __name__ == "__main__":
    # Runs the in-memory examples; swap in generate_from_file() to run the
    # file-based example instead.
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,3 +31,4 @@ openai==1.16.2 | |
evaluate==0.4.1 | ||
transformers[torch]==4.39.3 | ||
sentence-transformers==2.7.0 | ||
chromadb==0.4.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from . import _registry | ||
|
||
__all__ = ["_registry"] |
67 changes: 67 additions & 0 deletions
67
src/evidently/experimental/dataset_generators/_registry.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from evidently.experimental.dataset_generators.base import BaseDatasetGenerator | ||
from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider | ||
from evidently.experimental.dataset_generators.llm.splitter import Splitter | ||
from evidently.pydantic_utils import register_type_alias | ||
from evidently.utils.llm.prompts import PromptTemplate | ||
|
||
# Each row is (base type, dotted class path, alias). Rows are registered in
# the order listed.
# NOTE(review): "data_collecton_provider" is spelled this way in the alias
# strings; they are runtime identifiers, so they are reproduced verbatim.
_TYPE_ALIASES = (
    (
        BaseDatasetGenerator,
        "evidently.experimental.dataset_generators.llm.questions.QADatasetFromSeedGenerator",
        "evidently:dataset_generator:QADatasetFromSeedGenerator",
    ),
    (
        BaseDatasetGenerator,
        "evidently.experimental.dataset_generators.llm.questions.QADatasetGenerator",
        "evidently:dataset_generator:QADatasetGenerator",
    ),
    (
        DataCollectionProvider,
        "evidently.experimental.dataset_generators.llm.index.ChunksDataCollectionProvider",
        "evidently:data_collecton_provider:ChunksDataCollectionProvider",
    ),
    (
        DataCollectionProvider,
        "evidently.experimental.dataset_generators.llm.index.FileDataCollectionProvider",
        "evidently:data_collecton_provider:FileDataCollectionProvider",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.BaselineAnswerPromptTemplate",
        "evidently:prompt_template:BaselineAnswerPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.NaiveQuestionsFromContextPromptTemplate",
        "evidently:prompt_template:NaiveQuestionsFromContextPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.QuestionsFromContextPromptTemplate",
        "evidently:prompt_template:QuestionsFromContextPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.QuestionsFromSeedPromptTemplate",
        "evidently:prompt_template:QuestionsFromSeedPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.ReformulateQuestionPromptTemplate",
        "evidently:prompt_template:ReformulateQuestionPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.SimpleQuestionPromptTemplate",
        "evidently:prompt_template:SimpleQuestionPromptTemplate",
    ),
    (
        Splitter,
        "evidently.experimental.dataset_generators.llm.splitter.LlamaIndexSplitter",
        "evidently:splitter:LlamaIndexSplitter",
    ),
    (
        Splitter,
        "evidently.experimental.dataset_generators.llm.splitter.SimpleSplitter",
        "evidently:splitter:SimpleSplitter",
    ),
)

for _base_type, _classpath, _alias in _TYPE_ALIASES:
    register_type_alias(_base_type, _classpath, _alias)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from abc import ABC | ||
from abc import abstractmethod | ||
|
||
import pandas as pd | ||
from typing_extensions import TypeAlias | ||
|
||
from evidently.options.base import Options | ||
from evidently.pydantic_utils import EvidentlyBaseModel | ||
|
||
DatasetGeneratorResult: TypeAlias = pd.DataFrame | ||
|
||
|
||
class BaseDatasetGenerator(EvidentlyBaseModel, ABC):
    """Abstract base class for dataset generators.

    Concrete generators implement :meth:`generate`, which returns a
    ``DatasetGeneratorResult`` (an alias of ``pandas.DataFrame``).
    """

    class Config:
        # NOTE(review): presumably marks this class as a root type for the
        # project's type-alias machinery; confirm against pydantic_utils.
        is_base_type = True

    # Options object carried by every generator.
    options: Options

    @abstractmethod
    def generate(self) -> DatasetGeneratorResult:
        """Build and return the generated dataset.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from typing import Optional | ||
|
||
from evidently._pydantic_compat import PrivateAttr | ||
from evidently.experimental.dataset_generators.base import BaseDatasetGenerator | ||
from evidently.options.base import Options | ||
from evidently.utils.llm.wrapper import LLMWrapper | ||
from evidently.utils.llm.wrapper import get_llm_wrapper | ||
|
||
|
||
class BaseLLMDatasetGenerator(BaseDatasetGenerator):
    """Dataset generator base that obtains an ``LLMWrapper`` on demand.

    The wrapper is created on first use from ``provider`` and ``model`` and
    is then reused by later calls on the same instance.
    """

    # Provider and model names, passed to ``get_llm_wrapper`` unchanged.
    provider: str
    model: str
    # Cached wrapper; stays ``None`` until it is first requested.
    _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)

    def get_llm_wrapper(self, options: Options) -> LLMWrapper:
        """Return the cached wrapper, creating it on the first call.

        Args:
            options: Forwarded to the module-level ``get_llm_wrapper`` factory.
                It is only used when the wrapper is first created; once a
                wrapper is cached, it is returned regardless of ``options``.

        Returns:
            The wrapper stored on this instance.
        """
        cached = self._llm_wrapper
        if cached is None:
            cached = get_llm_wrapper(self.provider, self.model, options)
            self._llm_wrapper = cached
        return cached

    @property
    def wrapper(self) -> LLMWrapper:
        """The LLM wrapper obtained with this generator's own ``options``."""
        return self.get_llm_wrapper(self.options)
Oops, something went wrong.