diff --git a/configs/dialogs_config.py b/configs/dialogs_config.py
index 54fd7d9..38f5da9 100644
--- a/configs/dialogs_config.py
+++ b/configs/dialogs_config.py
@@ -22,28 +22,42 @@ class DialogsConfig:
 
     def __post_init__(self) -> None:
         # Post validate
-        self.answer_lengths = [len(answer) for answer in self.answers_list]
         self.prompt_lengths = [len(prompt) for prompt in self.user_prompts]
+        self.answer_lengths = [len(answer) for answer in self.agent_responses]
 
     def __str__(self) -> str:
         return self.__repr__
 
+    @staticmethod
+    def intersect_lists(list1, list2):
+        intersected = []
+        min_length = min(len(list1), len(list2))
+
+        for i in range(min_length):
+            intersected.append(list1[i])
+            intersected.append(list2[i])
+
+        # Add remaining elements if any list is longer
+        if len(list1) > len(list2):
+            intersected.extend(list1[min_length:])
+        elif len(list2) > len(list1):
+            intersected.extend(list2[min_length:])
+
+        return intersected
+
     @property
     def __repr__(self) -> str:
         s = ""
         s += f"\n Question id: {self.qas_id}"
         s += f"\n System prompt: {self.system_prompt}"
-        s += f"\n Question: {self.question_text}"
-        if self.context_list:
-            s += "\n Context list: \n"
-            for context, length in zip(self.context_list, self.context_lengths):
-                s += f"{context}\n"
-                s += f"Context length: {length}\n\n"
-        if self.answers_list:
-            s += "\n Answer list: \n"
-            for answer, length in zip(self.answers_list, self.answer_lengths):
-                s += f"{answer}\n"
-                s += f"Answer length: {length}\n\n"
+        s += f"\n Dialogs: \n"
+
+        if self.user_prompts and self.agent_responses:
+            final_dialogs = self.intersect_lists(self.user_prompts, self.agent_responses)
+            final_dialogs_length = self.intersect_lists(self.prompt_lengths, self.answer_lengths)
+            for idx, (dialog, length) in enumerate(zip(final_dialogs, final_dialogs_length)):
+                s += f"Dialog {idx}: {dialog} \n"
+                s += f"Dialog {idx} length: {length}\n"
 
         return s
 
@@ -62,4 +76,26 @@ def get_dict_str(self, indent: int=4) -> None:
         pp.pprint(self.get_dict)
 
 
 
+if __name__ == "__main__":
+    example_dialog = {"qas_id": 10,
+                      "system_prompt": "You are an AI assistant, help as much as you can",
+                      "user_prompts": ["Tell me a bit about AI", "How does AI learn"],
+                      "agent_responses": ["Artificial Intelligence (AI) is a broad field focusing on creating systems or machines that can perform tasks that typically require human intelligence. It encompasses various subfields like machine learning, natural language processing, computer vision, robotics, and more. AI aims to simulate human cognitive functions, such as learning, problem-solving, perception, reasoning, and decision-making.",
+                                          '''AI learning primarily occurs through machine learning algorithms. There are a few key ways in which AI learns:
+
+Supervised Learning: This method involves training AI models on labeled data. The algorithm learns patterns and associations between input data and corresponding output labels. For instance, in image recognition, showing the AI images labeled as "cat" or "dog" helps it learn to differentiate between the two.
+
+Unsupervised Learning: Here, the AI learns from data without labeled outcomes. It looks for patterns, structures, or relationships within the data. Clustering algorithms, for example, can group similar data points together without prior labeling.
+
+Reinforcement Learning: This method involves the AI learning through trial and error by interacting with an environment. It receives feedback in the form of rewards or penalties based on its actions. The AI's goal is to maximize cumulative reward, learning optimal strategies by exploring different actions.
+
+Transfer Learning: This technique involves transferring knowledge learned from one task to another. Models pre-trained on vast amounts of data for one task can be fine-tuned or adapted to perform related tasks more effectively with smaller datasets.
+
+AI learns by adjusting internal parameters or features in its algorithms to minimize errors or differences between predicted and actual outputs. This adjustment process, often referred to as "training," involves feeding the AI large amounts of data, iterating through algorithms, and refining the model's predictions or actions over time.
+
+As AI continues to evolve, researchers are exploring new learning methodologies to enhance its capabilities, making it more adaptable, efficient, and capable of handling complex tasks across various domains.'''],
+                      }
+    dialog_config_data = DialogsConfig(**example_dialog)
+    print(dialog_config_data)
+
diff --git a/examples/ELI5/ELI5_10docs_Parser.py b/examples/ELI5/ELI5_10docs_Parser.py
index c4cfba6..944cd70 100644
--- a/examples/ELI5/ELI5_10docs_Parser.py
+++ b/examples/ELI5/ELI5_10docs_Parser.py
@@ -128,7 +128,7 @@ def convert(self) -> None:
                                        r"examples/ELI5",
                                        max_example_per_thread=100,
                                        large_chunks_threshold=1000,
-                                       target_lang="ko")
+                                       target_lang="ru")
     eli5_val_parser.read()
     eli5_val_parser.convert()
     eli5_val_parser.save
\ No newline at end of file
diff --git a/tests/eli5_qaconfig_test.py b/tests/eli5_qaconfig_test.py
index 6e4d291..851c275 100644
--- a/tests/eli5_qaconfig_test.py
+++ b/tests/eli5_qaconfig_test.py
@@ -1,7 +1,8 @@
 import os
 import unittest
 import warnings
-
+import sys
+sys.path.insert(0,r'./')
 from datasets import load_dataset
 
 from examples.ELI5.ELI5_10_docs_QAConfigParser import ELI5ValQAConfig
diff --git a/tests/eli5_test.py b/tests/eli5_test.py
index 1e514a7..0deebb6 100644
--- a/tests/eli5_test.py
+++ b/tests/eli5_test.py
@@ -1,6 +1,8 @@
 import os
 import unittest
 import warnings
+import sys
+sys.path.insert(0,r'./')
 
 from datasets import load_dataset
 
diff --git a/translator/data_parser.py b/translator/data_parser.py
index 9a22fde..1a1b465 100644
--- a/translator/data_parser.py
+++ b/translator/data_parser.py
@@ -270,8 +270,8 @@ def extract_texts(obj):
 
         return {'text_list': target_texts, 'key': sub_list_idx} if sub_list_idx is not None else target_texts
 
-    @timeit
-    def translate_converted(self, en_data: List[str] = None,
+    def translate_converted(self,
+                            en_data: List[str] = None,
                             desc: str = None,
                             translator: Translator = None,
                             large_chunk: List[str] = None) -> Union[None, List[str]]:
@@ -332,7 +332,10 @@ def callback_done(future):
 
                 for idx, chunk in enumerate(chunks):
                     # Assign each thread with a new Translator instance
-                    future_chunk = executor.submit(self.translate_converted, chunk, f"chunk {idx}", Translator())
+                    future_chunk = executor.submit(self.translate_converted,
+                                                   en_data=chunk,
+                                                   desc=f"chunk {idx}",
+                                                   translator=Translator())
                     future_chunk.add_done_callback(callback_done)
                     future_dict = {
                         "future": future_chunk,
@@ -356,8 +359,10 @@ def callback_done(future):
                     if future_dict['future'].exception():
                         tqdm.write(
                             f"\n Thread {future_dict['idx']} failed, restarting thread with chunk {future_dict['idx']}\n")
-                        backup_future_chunk = executor.submit(self.translate_converted, chunks[future_dict['idx']],
-                                                              f"Backup chunk {future_dict['idx']}", Translator())
+                        backup_future_chunk = executor.submit(self.translate_converted,
+                                                              en_data=chunks[future_dict['idx']],
+                                                              desc=f"Backup chunk {future_dict['idx']}",
+                                                              translator=Translator())
                         backup_future_chunk.add_done_callback(callback_done)
                         backup_future_dict = {"future": backup_future_chunk,
                                               "idx": future_dict['idx']}
@@ -381,9 +386,7 @@ def callback_done(future):
                 for example in tqdm(converted_data, desc=progress_bar_desc, colour="blue"):
                     translated_data_example = self.translate_en2vi_advance_qa(example,
                                                                               translator,
-                                                                              progress_idx=int(re.findall(r'\d+', desc)[
-                                                                                  0]) if re.findall(r'\d+',
-                                                                                                    desc) else 0)
+                                                                              progress_idx=int(re.findall(r'\d+', desc)[0]) if desc and re.findall(r'\d+', desc) else 0)
                     translated_data.append(translated_data_example)
         if en_data: return translated_data
         if large_chunk:
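
A quick illustration of the new DialogsConfig.intersect_lists helper added in configs/dialogs_config.py: it interleaves two lists turn by turn and appends any leftover items from the longer one, which is what lets the rewritten __repr__ render a conversation as alternating user/agent turns. The sketch below is not part of the diff; the import path configs.dialogs_config and the sample strings are assumptions, and the sys.path tweak mirrors the one the updated tests use.

# Minimal sketch, not part of the diff; assumes it is run from the repo root.
import sys
sys.path.insert(0, r'./')  # same path trick as the updated tests

from configs.dialogs_config import DialogsConfig  # assumed import path

prompts = ["Tell me a bit about AI", "How does AI learn", "Thanks!"]  # illustrative data
responses = ["AI is a broad field ...", "It learns from data ..."]    # illustrative data

# Interleaves turn by turn, then appends the leftover third prompt.
dialogs = DialogsConfig.intersect_lists(prompts, responses)
for idx, turn in enumerate(dialogs):
    print(f"Dialog {idx}: {turn}")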