Skip to content

Commit

Permalink
Merge pull request #54 from A-Baji/dev
Browse files Browse the repository at this point in the history
3.0.8
  • Loading branch information
A-Baji authored Jul 12, 2024
2 parents bcb93ba + c0a9eae commit 0e7def1
Show file tree
Hide file tree
Showing 8 changed files with 155 additions and 55 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

Observes [Semantic Versioning](https://semver.org/spec/v2.0.0.html) standard and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) convention.

## [3.0.8] - 07-11-2024

### Changed

- changed distributed selection mode to a flag: `--distributed`
- fixed a small bug for distributed mode with offsets

## [3.0.7] - 07-11-2024

### Changed
Expand Down Expand Up @@ -133,6 +140,8 @@ Observes [Semantic Versioning](https://semver.org/spec/v2.0.0.html) standard and

- switched to `pathlib` for file path parsing

[3.0.8]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.7...3.0.8
[3.0.7]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.6...3.0.7
[3.0.6]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.5...3.0.6
[3.0.5]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.4...3.0.5
[3.0.4]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.3...3.0.4
Expand Down
2 changes: 1 addition & 1 deletion discordai_modelizer/command_line/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def read_modelizer_args(args, model_subcommand, job_subcommand):
thought_min=args.thought_min,
max_entry_count=args.max_entries,
offset=args.offset,
select_mode=args.select_mode,
distributed=args.distributed,
reverse=args.reverse,
base_model=args.base_model,
clean=args.dirty,
Expand Down
10 changes: 4 additions & 6 deletions discordai_modelizer/command_line/subparsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,11 @@ def setup_model_create(model_subcommand, is_parent=False):
help="The offset by line index starting at 0 for where to start selecting lines for the dataset: DEFAULT=0",
)
model_create_optional_named.add_argument(
"-s",
"--select-mode",
choices=["sequential", "distributed"],
default="sequential",
"--distributed",
action="store_true",
required=False,
dest="select_mode",
help="The method to select lines for the dataset, where `sequential` mode will select lines in chronological order, while `distributed` mode will select an even distribution of lines: DEFAULT=sequential",
dest="distributed",
help="Select lines as an even distribution instead of sequentially",
)
model_create_optional_named.add_argument(
"--reverse_lines",
Expand Down
4 changes: 2 additions & 2 deletions discordai_modelizer/customize.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def create_model(
thought_min=4,
max_entry_count=1000,
offset=0,
select_mode="sequential",
distributed=False,
base_model="none",
reverse=False,
clean=False,
Expand Down Expand Up @@ -99,7 +99,7 @@ def create_model(
except UserNotFoundError as e:
print(f"ERROR: {e}")
return
get_lines(full_dataset_path, max_entry_count, offset, select_mode, reverse)
get_lines(full_dataset_path, max_entry_count, offset, distributed, reverse)
if not clean:
print(f"INFO: Dataset saved to {full_dataset_path}")

Expand Down
10 changes: 4 additions & 6 deletions discordai_modelizer/gen_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,19 +123,17 @@ def add_to_dataset(thought: str):
)


def get_lines(
file_name: str, N=1000, offset=0, select_mode="sequential", reverse=False
):
def get_lines(file_name: str, N=1000, offset=0, distributed=False, reverse=False):
with open(file_name, "r") as f:
lines = f.readlines()
f.close()

num_lines = len(lines)

if select_mode == "sequential":
step = 1
if distributed:
step = (num_lines - offset) // N
else:
step = num_lines // N
step = 1

if reverse:
lines = lines[::-1]
Expand Down
2 changes: 1 addition & 1 deletion discordai_modelizer/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.0.7"
__version__ = "3.0.8"
130 changes: 92 additions & 38 deletions tests/expected_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
{"id": "gpt-3.5-turbo", "created": "2023-02-28 18:56:42"},
{"id": "gpt-3.5-turbo-instruct", "created": "2023-08-24 18:23:47"},
{"id": "gpt-3.5-turbo-instruct-0914", "created": "2023-09-07 21:34:32"},
{"id": "text-embedding-ada-002", "created": "2022-12-16 19:01:39"},
{"id": "gpt-4o", "created": "2024-05-10 18:50:49"},
{"id": "text-embedding-ada-002", "created": "2022-12-16 19:01:39"},
{"id": "davinci-002", "created": "2023-08-21 16:11:41"},
{
"id": "davinci:ft-personal:jason-9582-2022-12-23-05-45-51",
Expand Down Expand Up @@ -194,6 +194,18 @@
"id": "ft:davinci-002:personal:1663-wardellstephe:9jgCpXW5",
"created": "2024-07-11 05:00:27",
},
{
"id": "ft:davinci-002:personal:wardellstephe-1663:9jxGWoJw:ckpt-step-769",
"created": "2024-07-11 23:13:25",
},
{
"id": "ft:davinci-002:personal:wardellstephe-1663:9jxGXXBT:ckpt-step-1538",
"created": "2024-07-11 23:13:25",
},
{
"id": "ft:davinci-002:personal:wardellstephe-1663:9jxGX7BS",
"created": "2024-07-11 23:13:25",
},
]

list_module_expected_full = [
Expand Down Expand Up @@ -341,18 +353,18 @@
"object": "model",
"owned_by": "system",
},
{
"id": "text-embedding-ada-002",
"created": "2022-12-16 19:01:39",
"object": "model",
"owned_by": "openai-internal",
},
{
"id": "gpt-4o",
"created": "2024-05-10 18:50:49",
"object": "model",
"owned_by": "system",
},
{
"id": "text-embedding-ada-002",
"created": "2022-12-16 19:01:39",
"object": "model",
"owned_by": "openai-internal",
},
{
"id": "davinci-002",
"created": "2023-08-21 16:11:41",
Expand Down Expand Up @@ -611,9 +623,34 @@
"object": "model",
"owned_by": "user-ygljrfw1bneby79ndybpnodl",
},
{
"id": "ft:davinci-002:personal:wardellstephe-1663:9jxGWoJw:ckpt-step-769",
"created": "2024-07-11 23:13:25",
"object": "model",
"owned_by": "user-ygljrfw1bneby79ndybpnodl",
},
{
"id": "ft:davinci-002:personal:wardellstephe-1663:9jxGXXBT:ckpt-step-1538",
"created": "2024-07-11 23:13:25",
"object": "model",
"owned_by": "user-ygljrfw1bneby79ndybpnodl",
},
{
"id": "ft:davinci-002:personal:wardellstephe-1663:9jxGX7BS",
"created": "2024-07-11 23:13:25",
"object": "model",
"owned_by": "user-ygljrfw1bneby79ndybpnodl",
},
]

list_job_expected = [
{
"id": "ftjob-6DFW30DOSQAZaVQ4n3fZ9U4w",
"model": "davinci-002",
"status": "succeeded",
"created_at": "2024-07-11 23:01:08",
"finished_at": "2024-07-11 23:13:23",
},
{
"id": "ftjob-PJ6OUUoontOdeUlGmNDBvnfx",
"model": "davinci-002",
Expand Down Expand Up @@ -747,16 +784,33 @@
"created_at": "2024-06-23 22:46:25",
"finished_at": "2024-06-23 22:49:44",
},
{
"id": "ftjob-WMnwxBtBuJQgiX3ZRnuXlTF3",
"model": "gpt-3.5-turbo-0125",
"status": "succeeded",
"created_at": "2024-06-23 20:31:21",
"finished_at": "2024-06-23 21:16:09",
},
]

list_job_expected_full = [
{
"id": "ftjob-6DFW30DOSQAZaVQ4n3fZ9U4w",
"created_at": "2024-07-11 23:01:08",
"error": {"code": None, "message": None, "param": None},
"fine_tuned_model": "ft:davinci-002:personal:wardellstephe-1663:9jxGX7BS",
"finished_at": "2024-07-11 23:13:23",
"hyperparameters": {
"n_epochs": 2,
"batch_size": 13,
"learning_rate_multiplier": 16,
},
"model": "davinci-002",
"object": "fine_tuning.job",
"organization_id": "org-UN3Y7fSoTFPVCWR6f4AMWsVf",
"result_files": ["file-dWjaHOWq5CWsJFUrAWJrZYH0"],
"seed": 706919378,
"status": "succeeded",
"trained_tokens": 348890,
"training_file": "file-9x6n6frPWT8mAx7BeuV0MNQM",
"validation_file": None,
"estimated_finish": None,
"integrations": [],
"user_provided_suffix": "wardellstephe_1663",
},
{
"id": "ftjob-PJ6OUUoontOdeUlGmNDBvnfx",
"created_at": "2024-07-11 04:48:11",
Expand Down Expand Up @@ -1261,30 +1315,6 @@
"integrations": [],
"user_provided_suffix": "1061_adibiswat",
},
{
"id": "ftjob-WMnwxBtBuJQgiX3ZRnuXlTF3",
"created_at": "2024-06-23 20:31:21",
"error": {"code": None, "message": None, "param": None},
"fine_tuned_model": "ft:gpt-3.5-turbo-0125:personal:1663-adibiswat:9dOrDhxp",
"finished_at": "2024-06-23 21:16:09",
"hyperparameters": {
"n_epochs": 3,
"batch_size": 1,
"learning_rate_multiplier": 2,
},
"model": "gpt-3.5-turbo-0125",
"object": "fine_tuning.job",
"organization_id": "org-UN3Y7fSoTFPVCWR6f4AMWsVf",
"result_files": ["file-8ya7wn36JZz6cc9tEy3diyE3"],
"seed": 1834725185,
"status": "succeeded",
"trained_tokens": 76449,
"training_file": "file-RhHsfdxIq0v1AnstxhULFSvO",
"validation_file": None,
"estimated_finish": None,
"integrations": [],
"user_provided_suffix": "1663_adibiswat",
},
]

job_info_expected = {
Expand Down Expand Up @@ -2160,3 +2190,27 @@
{"prompt": "adibiswat says:", "completion": " message."},
{"prompt": "adibiswat says:", "completion": " message."},
]

# Expected dataset rows after `gen_dataset.get_lines(..., N=5, distributed=True, offset=2)`;
# compared against the generated file by `test_gen_dataset_distributed_offset`.
gen_dataset_max_5_distributed_offset_2 = [
{"prompt": "adibiswat says:", "completion": " a."},
{"prompt": "adibiswat says:", "completion": " ai tests."},
{"prompt": "adibiswat says:", "completion": " pretermined messages."},
{"prompt": "adibiswat says:", "completion": " another."},
{"prompt": "adibiswat says:", "completion": " also."},
]

# Expected dataset rows after `gen_dataset.get_lines(..., N=5, distributed=True, reverse=True)`;
# compared against the generated file by `test_gen_dataset_distributed_reverse`.
gen_dataset_max_5_distributed_reverse = [
{"prompt": "adibiswat says:", "completion": " the last message."},
{"prompt": "adibiswat says:", "completion": " message."},
{"prompt": "adibiswat says:", "completion": " message."},
{"prompt": "adibiswat says:", "completion": " pretermined messages."},
{"prompt": "adibiswat says:", "completion": " discord."},
]

# Expected dataset rows after
# `gen_dataset.get_lines(..., N=5, distributed=True, reverse=True, offset=2)`;
# compared against the generated file by `test_gen_dataset_distributed_reverse_offset`.
gen_dataset_max_5_distributed_reverse_offset_2 = [
{"prompt": "adibiswat says:", "completion": " message."},
{"prompt": "adibiswat says:", "completion": " a."},
{"prompt": "adibiswat says:", "completion": " message."},
{"prompt": "adibiswat says:", "completion": " for the test data."},
{"prompt": "adibiswat says:", "completion": " i."},
]
43 changes: 42 additions & 1 deletion tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def test_gen_dataset_distributed(default_file_output):
gen_dataset.parse_logs(
FULL_LOGS_PATH, CHANNEL_ID, USER, thought_time=0, thought_min=1
)
gen_dataset.get_lines(FULL_DATASET_PATH, N=5, select_mode="distributed")
gen_dataset.get_lines(FULL_DATASET_PATH, N=5, distributed=True)
with open(FULL_DATASET_PATH, "r", encoding="utf-8") as data_file:
list_dict_comp(
expected_values.gen_dataset_max_5_distributed,
Expand All @@ -154,6 +154,47 @@ def test_gen_dataset_distributed(default_file_output):
data_file.close()


def test_gen_dataset_distributed_offset(default_file_output):
    """Check distributed line selection combined with a start offset of 2.

    Runs `parse_logs` on the fixture logs, trims the resulting dataset with
    `get_lines(N=5, distributed=True, offset=2)`, and compares the JSON lines
    left in the dataset file with
    `expected_values.gen_dataset_max_5_distributed_offset_2`.
    """
    gen_dataset.parse_logs(
        FULL_LOGS_PATH, CHANNEL_ID, USER, thought_time=0, thought_min=1
    )
    gen_dataset.get_lines(FULL_DATASET_PATH, N=5, distributed=True, offset=2)
    # The context manager closes the file on exit, so no explicit close() call
    # is needed afterwards.
    with open(FULL_DATASET_PATH, "r", encoding="utf-8") as data_file:
        actual_rows = [loads(line) for line in data_file]
    list_dict_comp(
        expected_values.gen_dataset_max_5_distributed_offset_2,
        actual_rows,
    )


def test_gen_dataset_distributed_reverse(default_file_output):
    """Check distributed line selection combined with reversed line order.

    Runs `parse_logs` on the fixture logs, trims the resulting dataset with
    `get_lines(N=5, distributed=True, reverse=True)`, and compares the JSON
    lines left in the dataset file with
    `expected_values.gen_dataset_max_5_distributed_reverse`.
    """
    gen_dataset.parse_logs(
        FULL_LOGS_PATH, CHANNEL_ID, USER, thought_time=0, thought_min=1
    )
    gen_dataset.get_lines(FULL_DATASET_PATH, N=5, distributed=True, reverse=True)
    # The context manager closes the file on exit, so no explicit close() call
    # is needed afterwards.
    with open(FULL_DATASET_PATH, "r", encoding="utf-8") as data_file:
        actual_rows = [loads(line) for line in data_file]
    list_dict_comp(
        expected_values.gen_dataset_max_5_distributed_reverse,
        actual_rows,
    )


def test_gen_dataset_distributed_reverse_offset(default_file_output):
    """Check distributed selection with both reversed order and an offset of 2.

    Runs `parse_logs` on the fixture logs, trims the resulting dataset with
    `get_lines(N=5, distributed=True, reverse=True, offset=2)`, and compares
    the JSON lines left in the dataset file with
    `expected_values.gen_dataset_max_5_distributed_reverse_offset_2`.
    """
    gen_dataset.parse_logs(
        FULL_LOGS_PATH, CHANNEL_ID, USER, thought_time=0, thought_min=1
    )
    gen_dataset.get_lines(
        FULL_DATASET_PATH, N=5, distributed=True, reverse=True, offset=2
    )
    # The context manager closes the file on exit, so no explicit close() call
    # is needed afterwards.
    with open(FULL_DATASET_PATH, "r", encoding="utf-8") as data_file:
        actual_rows = [loads(line) for line in data_file]
    list_dict_comp(
        expected_values.gen_dataset_max_5_distributed_reverse_offset_2,
        actual_rows,
    )


def test_parse_logs_user_not_found(default_file_output):
username = "bad_username"
with raises(
Expand Down

0 comments on commit 0e7def1

Please sign in to comment.