-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from vTuanpham/feat/Dialogs_config
Feat/dialogs config
- Loading branch information
Showing
18 changed files
with
515 additions
and
94 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ on: | |
branches: | ||
- main | ||
- dev | ||
- feat/* | ||
jobs: | ||
test: | ||
runs-on: ubuntu-latest | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -159,4 +159,5 @@ cython_debug/ | |
#.idea/ | ||
|
||
*.json | ||
.idea/ | ||
.idea/ | ||
*.Identifier |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
from .base_config import BaseConfig | ||
from .qa_config import QAConfig | ||
from .qa_config import QAConfig | ||
from .dialogs_config import DialogsConfig |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
import sys | ||
sys.path.insert(0,r'./') | ||
import pprint | ||
from typing import List, Dict | ||
from dataclasses import dataclass, asdict, fields | ||
|
||
|
||
@dataclass
class DialogsConfig:
    """
    A single training/test example for a multi-turn conversation.

    Attributes:
        qas_id: Identifier of the example.
        system_prompt: System prompt that opens the conversation.
        user_prompts: User turns, in conversation order.
        agent_responses: Agent turns, in conversation order. May be omitted.
        answer_lengths: ``len()`` of every agent response. Always recomputed
            in ``__post_init__``; a value passed to the constructor is ignored.
        prompt_lengths: ``len()`` of every user prompt. Always recomputed in
            ``__post_init__``; a value passed to the constructor is ignored.
    """
    qas_id: str
    system_prompt: str

    user_prompts: list

    agent_responses: list = None

    answer_lengths: List[int] = None
    prompt_lengths: List[int] = None

    def __post_init__(self) -> None:
        """Derive the per-turn length lists from the dialog turns."""
        # `agent_responses` defaults to None, so fall back to an empty list
        # instead of failing while iterating over None.
        self.prompt_lengths = [len(prompt) for prompt in self.user_prompts or []]
        self.answer_lengths = [len(answer) for answer in self.agent_responses or []]

    def __str__(self) -> str:
        """Return the same human-readable summary as ``__repr__``."""
        return self.__repr__()

    @staticmethod
    def intersect_lists(list1, list2):
        """
        Interleave two lists element by element.

        The result is ``[list1[0], list2[0], list1[1], list2[1], ...]``; once
        the shorter list is exhausted, the remaining elements of the longer
        list are appended in their original order.
        """
        shared_length = min(len(list1), len(list2))

        interleaved = [item for pair in zip(list1, list2) for item in pair]

        # At most one of these slices is non-empty: the tail of the longer list.
        interleaved.extend(list1[shared_length:])
        interleaved.extend(list2[shared_length:])

        return interleaved

    def __repr__(self) -> str:
        """
        Build a multi-line summary: id, system prompt, then every turn with
        its length (user and agent turns interleaved).

        This is a regular method rather than a property so that the built-in
        ``repr()`` can call it.
        """
        parts = [
            f"\n Question id: {self.qas_id}",
            f"\n System prompt: {self.system_prompt}",
            "\n Dialogs: \n",
        ]

        # Turns are only listed when both sides of the dialog are present.
        if self.user_prompts and self.agent_responses:
            final_dialogs = self.intersect_lists(self.user_prompts, self.agent_responses)
            final_dialogs_length = self.intersect_lists(self.prompt_lengths, self.answer_lengths)
            for idx, (dialog, length) in enumerate(zip(final_dialogs, final_dialogs_length)):
                parts.append(f"Dialog {idx}: {dialog} \n")
                parts.append(f"Dialog {idx} length: {length}\n")

        return "".join(parts)

    @property
    def get_dict(self) -> Dict:
        """Return the example as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @staticmethod
    def get_keys() -> List[str]:
        """Return the dataclass field names in declaration order."""
        return [field.name for field in fields(DialogsConfig)]

    @property
    def get_dict_str(self, indent: int=4) -> None:
        """
        Pretty-print the dict form of the example to stdout.

        Because this is a property, it is always invoked without arguments,
        so ``indent`` effectively always takes its default value.
        """
        pp = pprint.PrettyPrinter(indent=indent)
        pp.pprint(self.get_dict)
|
||
|
||
if __name__ == "__main__":
    # Manual smoke test: build one two-turn example and print its summary.
    first_answer = "Artificial Intelligence (AI) is a broad field focusing on creating systems or machines that can perform tasks that typically require human intelligence. It encompasses various subfields like machine learning, natural language processing, computer vision, robotics, and more. AI aims to simulate human cognitive functions, such as learning, problem-solving, perception, reasoning, and decision-making."
    second_answer = '''AI learning primarily occurs through machine learning algorithms. There are a few key ways in which AI learns:
Supervised Learning: This method involves training AI models on labeled data. The algorithm learns patterns and associations between input data and corresponding output labels. For instance, in image recognition, showing the AI images labeled as "cat" or "dog" helps it learn to differentiate between the two.
Unsupervised Learning: Here, the AI learns from data without labeled outcomes. It looks for patterns, structures, or relationships within the data. Clustering algorithms, for example, can group similar data points together without prior labeling.
Reinforcement Learning: This method involves the AI learning through trial and error by interacting with an environment. It receives feedback in the form of rewards or penalties based on its actions. The AI's goal is to maximize cumulative reward, learning optimal strategies by exploring different actions.
Transfer Learning: This technique involves transferring knowledge learned from one task to another. Models pre-trained on vast amounts of data for one task can be fine-tuned or adapted to perform related tasks more effectively with smaller datasets.
AI learns by adjusting internal parameters or features in its algorithms to minimize errors or differences between predicted and actual outputs. This adjustment process, often referred to as "training," involves feeding the AI large amounts of data, iterating through algorithms, and refining the model's predictions or actions over time.
As AI continues to evolve, researchers are exploring new learning methodologies to enhance its capabilities, making it more adaptable, efficient, and capable of handling complex tasks across various domains.'''

    demo_config = DialogsConfig(
        qas_id=10,
        system_prompt="You are an AI assistant, help as much as you can",
        user_prompts=["Tell me a bit about AI", "How does AI learn"],
        agent_responses=[first_answer, second_answer],
    )
    print(demo_config)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import json | ||
import random | ||
import sys | ||
sys.path.insert(0,r'./') | ||
from tqdm.auto import tqdm | ||
|
||
from configs import DialogsConfig | ||
from translator import DataParser | ||
|
||
|
||
PARSER_NAME = "ShareGPT_V3" | ||
|
||
|
||
class ShareGPTV3(DataParser):
    """
    Parser for the ShareGPT V3 JSON dump.

    Each conversation is turned into a dict whose keys match the fields of
    ``DialogsConfig``; the 'user_prompts' and 'agent_responses' fields are the
    ones registered for translation.
    """

    def __init__(self, file_path: str, output_path: str, target_lang: str="vi",
                 max_example_per_thread=300, large_chunks_threshold=20000,
                 max_list_length_per_thread=3):
        # Everything except the two paths is forwarded by keyword.
        parser_options = dict(
            parser_name=PARSER_NAME,
            do_translate=True,
            target_config=DialogsConfig,
            target_fields=['user_prompts', 'agent_responses'],
            target_lang=target_lang,
            max_example_per_thread=max_example_per_thread,
            large_chunks_threshold=large_chunks_threshold,
            max_list_length_per_thread=max_list_length_per_thread,
        )
        super().__init__(file_path, output_path, **parser_options)

    def read(self) -> None:
        """Load the JSON file at ``self.file_path`` into ``self.data_read``."""
        # The base-class read() must run first (per the original author's
        # note, it is there to make sure the file path is correct).
        super().read()

        with open(self.file_path, encoding='utf-8') as source_file:
            loaded = json.load(source_file)

        # The read step is required to store its result in self.data_read.
        self.data_read = loaded

    def convert(self) -> None:
        """Convert ``self.data_read`` into dicts and store them in ``self.converted_data``."""
        # The base-class convert() must run first (per the original author's
        # note, it is there to make sure read() has populated self.data_read).
        super().convert()

        records = []
        for entry in tqdm(self.data_read, desc="Converting data"):
            example_id = entry['id']

            human_turns = []
            gpt_turns = []
            for turn in entry['conversations']:
                speaker = turn["from"]
                if speaker == "human":
                    human_turns.append(turn['value'])
                elif speaker == "gpt":
                    gpt_turns.append(turn['value'])
                # Turns from any other speaker are ignored.

            records.append({
                'system_prompt': "",
                'qas_id': example_id,
                'user_prompts': human_turns,
                'agent_responses': gpt_turns,
                'prompt_lengths': None,
                'answer_lengths': None,
            })

        # The final list must be assigned to self.converted_data; only the
        # first 5000 converted examples are kept.
        self.converted_data = records[:5000]
|
||
|
||
if __name__ == '__main__':
    # Script entry point: read the raw dump, convert it, then save the result.
    parser = ShareGPTV3(
        file_path=r"examples/ShareGPTV3/ShareGPT_V3_unfiltered_cleaned_split.json",
        output_path=r"examples/ShareGPTV3",
    )
    parser.read()
    parser.convert()
    # NOTE(review): `save` is accessed without call parentheses - presumably it
    # is a property on DataParser that performs the save; confirm against the base class.
    parser.save
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.