Skip to content

Commit

Permalink
fix, chore: fix TypeError on re, update dialogs_config
Browse files Browse the repository at this point in the history
  • Loading branch information
vTuanpham committed Dec 10, 2023
1 parent 2fc2dd3 commit 078e32d
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 22 deletions.
60 changes: 48 additions & 12 deletions configs/dialogs_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,28 +22,42 @@ class DialogsConfig:

def __post_init__(self) -> None:
# Post validate
# Caches len() of every prompt/response entry; __repr__ prints these
# lengths next to the corresponding text.
# NOTE(review): answer_lengths is assigned twice below (first from
# answers_list, then from agent_responses), so only the second value
# survives. This looks like a removed and an added diff line shown
# together -- confirm which one the real file contains.
self.answer_lengths = [len(answer) for answer in self.answers_list]
self.prompt_lengths = [len(prompt) for prompt in self.user_prompts]
self.answer_lengths = [len(answer) for answer in self.agent_responses]

def __str__(self) -> str:
# __repr__ is declared as a property in this class, so this attribute
# access already evaluates to the formatted string; adding call
# parentheses here would try to call a str.
return self.__repr__

@staticmethod
def intersect_lists(list1: list, list2: list) -> list:
    """Interleave two lists element by element.

    Despite the name, this is not a set intersection: the result
    alternates items from ``list1`` and ``list2`` (``list1`` first) for
    as long as both have elements, then appends whatever is left of the
    longer list.

    Args:
        list1: Sequence whose items occupy the even positions of the
            interleaved prefix.
        list2: Sequence whose items occupy the odd positions of the
            interleaved prefix.

    Returns:
        A new list; neither input is modified.
    """
    shared = min(len(list1), len(list2))

    # zip() stops at the shorter input, so this yields exactly the
    # alternating prefix of length 2 * shared.
    merged = [item for pair in zip(list1, list2) for item in pair]

    # At most one of these slices is non-empty (the tail of the longer
    # input), so extending with both needs no length comparison.
    merged.extend(list1[shared:])
    merged.extend(list2[shared:])

    return merged

@property
def __repr__(self) -> str:
# Builds a multi-line, human-readable dump of this config and returns it
# as a single string.
# NOTE(review): because this is a property, the built-in repr(obj) will
# fetch the string and then try to call it, raising TypeError; only
# str(obj)/print(obj) work, via __str__. Confirm this is intended.
# NOTE(review): the question/context/answer sections below read
# question_text, context_list, context_lengths and answers_list, while the
# dialog section reads user_prompts and agent_responses. These look like
# removed and added diff lines shown together -- confirm against the file.
s = ""
s += f"\n Question id: {self.qas_id}"
s += f"\n System prompt: {self.system_prompt}"
s += f"\n Question: {self.question_text}"
if self.context_list:
s += "\n Context list: \n"
for context, length in zip(self.context_list, self.context_lengths):
s += f"{context}\n"
s += f"Context length: {length}\n\n"
if self.answers_list:
s += "\n Answer list: \n"
for answer, length in zip(self.answers_list, self.answer_lengths):
s += f"{answer}\n"
s += f"Answer length: {length}\n\n"
s += f"\n Dialogs: \n"

# Interleave prompts with responses, and their cached lengths in the same
# order, so each printed dialog entry is paired with its own length.
if self.user_prompts and self.agent_responses:
final_dialogs = self.intersect_lists(self.user_prompts, self.agent_responses)
final_dialogs_length = self.intersect_lists(self.prompt_lengths, self.answer_lengths)
for idx, (dialog, length) in enumerate(zip(final_dialogs, final_dialogs_length)):
s += f"Dialog {idx}: {dialog} \n"
s += f"Dialog {idx} length: {length}\n"

return s

Expand All @@ -62,4 +76,26 @@ def get_dict_str(self, indent: int=4) -> None:
pp.pprint(self.get_dict)


if __name__ == "__main__":
# Manual smoke test: builds a DialogsConfig from a hand-written example
# with two user prompts and two agent responses, then prints it, which
# goes through the class's __str__.
example_dialog = {"qas_id": 10,
"system_prompt": "You are an AI assistant, help as much as you can",
"user_prompts": ["Tell me a bit about AI", "How does AI learn"],
"agent_responses": ["Artificial Intelligence (AI) is a broad field focusing on creating systems or machines that can perform tasks that typically require human intelligence. It encompasses various subfields like machine learning, natural language processing, computer vision, robotics, and more. AI aims to simulate human cognitive functions, such as learning, problem-solving, perception, reasoning, and decision-making.",
'''AI learning primarily occurs through machine learning algorithms. There are a few key ways in which AI learns:
Supervised Learning: This method involves training AI models on labeled data. The algorithm learns patterns and associations between input data and corresponding output labels. For instance, in image recognition, showing the AI images labeled as "cat" or "dog" helps it learn to differentiate between the two.
Unsupervised Learning: Here, the AI learns from data without labeled outcomes. It looks for patterns, structures, or relationships within the data. Clustering algorithms, for example, can group similar data points together without prior labeling.
Reinforcement Learning: This method involves the AI learning through trial and error by interacting with an environment. It receives feedback in the form of rewards or penalties based on its actions. The AI's goal is to maximize cumulative reward, learning optimal strategies by exploring different actions.
Transfer Learning: This technique involves transferring knowledge learned from one task to another. Models pre-trained on vast amounts of data for one task can be fine-tuned or adapted to perform related tasks more effectively with smaller datasets.
AI learns by adjusting internal parameters or features in its algorithms to minimize errors or differences between predicted and actual outputs. This adjustment process, often referred to as "training," involves feeding the AI large amounts of data, iterating through algorithms, and refining the model's predictions or actions over time.
As AI continues to evolve, researchers are exploring new learning methodologies to enhance its capabilities, making it more adaptable, efficient, and capable of handling complex tasks across various domains.'''],
}
# The dict keys are passed as keyword arguments to the constructor.
dialog_config_data = DialogsConfig(**example_dialog)
print(dialog_config_data)


2 changes: 1 addition & 1 deletion examples/ELI5/ELI5_10docs_Parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def convert(self) -> None:
r"examples/ELI5",
max_example_per_thread=100,
large_chunks_threshold=1000,
target_lang="ko")
target_lang="ru")
eli5_val_parser.read()
eli5_val_parser.convert()
eli5_val_parser.save
3 changes: 2 additions & 1 deletion tests/eli5_qaconfig_test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os
import unittest
import warnings

import sys
sys.path.insert(0,r'./')
from datasets import load_dataset

from examples.ELI5.ELI5_10_docs_QAConfigParser import ELI5ValQAConfig
Expand Down
2 changes: 2 additions & 0 deletions tests/eli5_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
import unittest
import warnings
import sys
sys.path.insert(0,r'./')

from datasets import load_dataset

Expand Down
19 changes: 11 additions & 8 deletions translator/data_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,8 @@ def extract_texts(obj):

return {'text_list': target_texts, 'key': sub_list_idx} if sub_list_idx is not None else target_texts

@timeit
def translate_converted(self, en_data: List[str] = None,
def translate_converted(self,
en_data: List[str] = None,
desc: str = None,
translator: Translator = None,
large_chunk: List[str] = None) -> Union[None, List[str]]:
Expand Down Expand Up @@ -332,7 +332,10 @@ def callback_done(future):

for idx, chunk in enumerate(chunks):
# Assign each thread with a new Translator instance
future_chunk = executor.submit(self.translate_converted, chunk, f"chunk {idx}", Translator())
future_chunk = executor.submit(self.translate_converted,
en_data=chunk,
desc=f"chunk {idx}",
translator=Translator())
future_chunk.add_done_callback(callback_done)
future_dict = {
"future": future_chunk,
Expand All @@ -356,8 +359,10 @@ def callback_done(future):
if future_dict['future'].exception():
tqdm.write(
f"\n Thread {future_dict['idx']} failed, restarting thread with chunk {future_dict['idx']}\n")
backup_future_chunk = executor.submit(self.translate_converted, chunks[future_dict['idx']],
f"Backup chunk {future_dict['idx']}", Translator())
backup_future_chunk = executor.submit(self.translate_converted,
en_data=chunks[future_dict['idx']],
desc=f"Backup chunk {future_dict['idx']}",
translator=Translator())
backup_future_chunk.add_done_callback(callback_done)
backup_future_dict = {"future": backup_future_chunk,
"idx": future_dict['idx']}
Expand All @@ -381,9 +386,7 @@ def callback_done(future):
for example in tqdm(converted_data, desc=progress_bar_desc, colour="blue"):
translated_data_example = self.translate_en2vi_advance_qa(example,
translator,
progress_idx=int(re.findall(r'\d+', desc)[
0]) if re.findall(r'\d+',
desc) else 0)
progress_idx=int(re.findall(r'\d+', desc)[0]) if desc and re.findall(r'\d+', desc) else 0)
translated_data.append(translated_data_example)
if en_data: return translated_data
if large_chunk:
Expand Down

0 comments on commit 078e32d

Please sign in to comment.