diff --git a/CHANGELOG.md b/CHANGELOG.md index bf5c91e..4cb4a5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ Observes [Semantic Versioning](https://semver.org/spec/v2.0.0.html) standard and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) convention. +## [3.0.8] - 07-11-2024 + +### Changed + +- changed distributed selection mode to a flag: `--distributed` +- fix a small bug for distributed mode with offsets + ## [3.0.7] - 07-11-2024 ### Changed @@ -133,6 +140,8 @@ Observes [Semantic Versioning](https://semver.org/spec/v2.0.0.html) standard and - switched to `pathlib` for file path parsing +[3.0.8]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.7...3.0.8 +[3.0.7]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.6...3.0.7 [3.0.6]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.5...3.0.6 [3.0.5]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.4...3.0.5 [3.0.4]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.3...3.0.4 diff --git a/discordai_modelizer/command_line/command_line.py b/discordai_modelizer/command_line/command_line.py index 9cf20f8..eb65c1a 100644 --- a/discordai_modelizer/command_line/command_line.py +++ b/discordai_modelizer/command_line/command_line.py @@ -47,7 +47,7 @@ def read_modelizer_args(args, model_subcommand, job_subcommand): thought_min=args.thought_min, max_entry_count=args.max_entries, offset=args.offset, - select_mode=args.select_mode, + distributed=args.distributed, reverse=args.reverse, base_model=args.base_model, clean=args.dirty, diff --git a/discordai_modelizer/command_line/subparsers.py b/discordai_modelizer/command_line/subparsers.py index 27d013d..2791cc2 100644 --- a/discordai_modelizer/command_line/subparsers.py +++ b/discordai_modelizer/command_line/subparsers.py @@ -131,13 +131,11 @@ def setup_model_create(model_subcommand, is_parent=False): help="The offset by line index starting at 0 for where to start selecting lines for the dataset: DEFAULT=0", ) model_create_optional_named.add_argument( - "-s", - "--select-mode", - choices=["sequential", "distributed"], - default="sequential", + "--distributed", + action="store_true", required=False, - dest="select_mode", - help="The method to select lines for the dataset, where `sequential` mode will select lines in chronological order, while `distributed` mode will select an even distribution of lines: DEFAULT=sequential", + dest="distributed", + help="Select lines as an even distribution instead of sequentially", ) model_create_optional_named.add_argument( "--reverse_lines", diff --git a/discordai_modelizer/customize.py b/discordai_modelizer/customize.py index 4579039..e2b2e1d 100644 --- a/discordai_modelizer/customize.py +++ b/discordai_modelizer/customize.py @@ -23,7 +23,7 @@ def create_model( thought_min=4, max_entry_count=1000, offset=0, - select_mode="sequential", + distributed=False, base_model="none", reverse=False, clean=False, @@ -99,7 +99,7 @@ def create_model( except UserNotFoundError as e: print(f"ERROR: {e}") return - get_lines(full_dataset_path, max_entry_count, offset, select_mode, reverse) + get_lines(full_dataset_path, max_entry_count, offset, distributed, reverse) if not clean: print(f"INFO: Dataset saved to {full_dataset_path}") diff --git a/discordai_modelizer/gen_dataset.py b/discordai_modelizer/gen_dataset.py index 8d93ebd..a71265e 100644 --- a/discordai_modelizer/gen_dataset.py +++ b/discordai_modelizer/gen_dataset.py @@ -123,19 +123,17 @@ def add_to_dataset(thought: str): ) -def get_lines( - file_name: str, N=1000, offset=0, select_mode="sequential", reverse=False -): +def get_lines(file_name: str, N=1000, offset=0, distributed=False, reverse=False): with open(file_name, "r") as f: lines = f.readlines() f.close() num_lines = len(lines) - if select_mode == "sequential": - step = 1 + if distributed: + step = (num_lines - offset) // N else: - step = num_lines // N + step = 1 if reverse: lines = lines[::-1] diff --git a/discordai_modelizer/version.py b/discordai_modelizer/version.py index c11769e..35c154a 100644 --- a/discordai_modelizer/version.py +++ b/discordai_modelizer/version.py @@ -1 +1 @@ -__version__ = "3.0.7" +__version__ = "3.0.8" diff --git a/tests/expected_values.py b/tests/expected_values.py index 23a06e1..9a0b9eb 100644 --- a/tests/expected_values.py +++ b/tests/expected_values.py @@ -23,8 +23,8 @@ {"id": "gpt-3.5-turbo", "created": "2023-02-28 18:56:42"}, {"id": "gpt-3.5-turbo-instruct", "created": "2023-08-24 18:23:47"}, {"id": "gpt-3.5-turbo-instruct-0914", "created": "2023-09-07 21:34:32"}, - {"id": "text-embedding-ada-002", "created": "2022-12-16 19:01:39"}, {"id": "gpt-4o", "created": "2024-05-10 18:50:49"}, + {"id": "text-embedding-ada-002", "created": "2022-12-16 19:01:39"}, {"id": "davinci-002", "created": "2023-08-21 16:11:41"}, { "id": "davinci:ft-personal:jason-9582-2022-12-23-05-45-51", @@ -194,6 +194,18 @@ "id": "ft:davinci-002:personal:1663-wardellstephe:9jgCpXW5", "created": "2024-07-11 05:00:27", }, + { + "id": "ft:davinci-002:personal:wardellstephe-1663:9jxGWoJw:ckpt-step-769", + "created": "2024-07-11 23:13:25", + }, + { + "id": "ft:davinci-002:personal:wardellstephe-1663:9jxGXXBT:ckpt-step-1538", + "created": "2024-07-11 23:13:25", + }, + { + "id": "ft:davinci-002:personal:wardellstephe-1663:9jxGX7BS", + "created": "2024-07-11 23:13:25", + }, ] list_module_expected_full = [ @@ -341,18 +353,18 @@ "object": "model", "owned_by": "system", }, - { - "id": "text-embedding-ada-002", - "created": "2022-12-16 19:01:39", - "object": "model", - "owned_by": "openai-internal", - }, { "id": "gpt-4o", "created": "2024-05-10 18:50:49", "object": "model", "owned_by": "system", }, + { + "id": "text-embedding-ada-002", + "created": "2022-12-16 19:01:39", + "object": "model", + "owned_by": "openai-internal", + }, { "id": "davinci-002", "created": "2023-08-21 16:11:41", @@ -611,9 +623,34 @@ "object": "model", "owned_by": "user-ygljrfw1bneby79ndybpnodl", }, + { + "id": "ft:davinci-002:personal:wardellstephe-1663:9jxGWoJw:ckpt-step-769", + "created": "2024-07-11 23:13:25", + "object": "model", + "owned_by": "user-ygljrfw1bneby79ndybpnodl", + }, + { + "id": "ft:davinci-002:personal:wardellstephe-1663:9jxGXXBT:ckpt-step-1538", + "created": "2024-07-11 23:13:25", + "object": "model", + "owned_by": "user-ygljrfw1bneby79ndybpnodl", + }, + { + "id": "ft:davinci-002:personal:wardellstephe-1663:9jxGX7BS", + "created": "2024-07-11 23:13:25", + "object": "model", + "owned_by": "user-ygljrfw1bneby79ndybpnodl", + }, ] list_job_expected = [ + { + "id": "ftjob-6DFW30DOSQAZaVQ4n3fZ9U4w", + "model": "davinci-002", + "status": "succeeded", + "created_at": "2024-07-11 23:01:08", + "finished_at": "2024-07-11 23:13:23", + }, { "id": "ftjob-PJ6OUUoontOdeUlGmNDBvnfx", "model": "davinci-002", @@ -747,16 +784,33 @@ "created_at": "2024-06-23 22:46:25", "finished_at": "2024-06-23 22:49:44", }, - { - "id": "ftjob-WMnwxBtBuJQgiX3ZRnuXlTF3", - "model": "gpt-3.5-turbo-0125", - "status": "succeeded", - "created_at": "2024-06-23 20:31:21", - "finished_at": "2024-06-23 21:16:09", - }, ] list_job_expected_full = [ + { + "id": "ftjob-6DFW30DOSQAZaVQ4n3fZ9U4w", + "created_at": "2024-07-11 23:01:08", + "error": {"code": None, "message": None, "param": None}, + "fine_tuned_model": "ft:davinci-002:personal:wardellstephe-1663:9jxGX7BS", + "finished_at": "2024-07-11 23:13:23", + "hyperparameters": { + "n_epochs": 2, + "batch_size": 13, + "learning_rate_multiplier": 16, + }, + "model": "davinci-002", + "object": "fine_tuning.job", + "organization_id": "org-UN3Y7fSoTFPVCWR6f4AMWsVf", + "result_files": ["file-dWjaHOWq5CWsJFUrAWJrZYH0"], + "seed": 706919378, + "status": "succeeded", + "trained_tokens": 348890, + "training_file": "file-9x6n6frPWT8mAx7BeuV0MNQM", + "validation_file": None, + "estimated_finish": None, + "integrations": [], + "user_provided_suffix": "wardellstephe_1663", + }, { "id": "ftjob-PJ6OUUoontOdeUlGmNDBvnfx", "created_at": "2024-07-11 04:48:11", @@ -1261,30 +1315,6 @@ "integrations": [], "user_provided_suffix": "1061_adibiswat", }, - { - "id": "ftjob-WMnwxBtBuJQgiX3ZRnuXlTF3", - "created_at": "2024-06-23 20:31:21", - "error": {"code": None, "message": None, "param": None}, - "fine_tuned_model": "ft:gpt-3.5-turbo-0125:personal:1663-adibiswat:9dOrDhxp", - "finished_at": "2024-06-23 21:16:09", - "hyperparameters": { - "n_epochs": 3, - "batch_size": 1, - "learning_rate_multiplier": 2, - }, - "model": "gpt-3.5-turbo-0125", - "object": "fine_tuning.job", - "organization_id": "org-UN3Y7fSoTFPVCWR6f4AMWsVf", - "result_files": ["file-8ya7wn36JZz6cc9tEy3diyE3"], - "seed": 1834725185, - "status": "succeeded", - "trained_tokens": 76449, - "training_file": "file-RhHsfdxIq0v1AnstxhULFSvO", - "validation_file": None, - "estimated_finish": None, - "integrations": [], - "user_provided_suffix": "1663_adibiswat", - }, ] job_info_expected = { @@ -2160,3 +2190,27 @@ {"prompt": "adibiswat says:", "completion": " message."}, {"prompt": "adibiswat says:", "completion": " message."}, ] + +gen_dataset_max_5_distributed_offset_2 = [ + {"prompt": "adibiswat says:", "completion": " a."}, + {"prompt": "adibiswat says:", "completion": " ai tests."}, + {"prompt": "adibiswat says:", "completion": " pretermined messages."}, + {"prompt": "adibiswat says:", "completion": " another."}, + {"prompt": "adibiswat says:", "completion": " also."}, +] + +gen_dataset_max_5_distributed_reverse = [ + {"prompt": "adibiswat says:", "completion": " the last message."}, + {"prompt": "adibiswat says:", "completion": " message."}, + {"prompt": "adibiswat says:", "completion": " message."}, + {"prompt": "adibiswat says:", "completion": " pretermined messages."}, + {"prompt": "adibiswat says:", "completion": " discord."}, +] + +gen_dataset_max_5_distributed_reverse_offset_2 = [ + {"prompt": "adibiswat says:", "completion": " message."}, + {"prompt": "adibiswat says:", "completion": " a."}, + {"prompt": "adibiswat says:", "completion": " message."}, + {"prompt": "adibiswat says:", "completion": " for the test data."}, + {"prompt": "adibiswat says:", "completion": " i."}, +] diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 1f1233e..c773afa 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -145,7 +145,7 @@ def test_gen_dataset_distributed(default_file_output): gen_dataset.parse_logs( FULL_LOGS_PATH, CHANNEL_ID, USER, thought_time=0, thought_min=1 ) - gen_dataset.get_lines(FULL_DATASET_PATH, N=5, select_mode="distributed") + gen_dataset.get_lines(FULL_DATASET_PATH, N=5, distributed=True) with open(FULL_DATASET_PATH, "r", encoding="utf-8") as data_file: list_dict_comp( expected_values.gen_dataset_max_5_distributed, @@ -154,6 +154,47 @@ def test_gen_dataset_distributed(default_file_output): data_file.close() +def test_gen_dataset_distributed_offset(default_file_output): + gen_dataset.parse_logs( + FULL_LOGS_PATH, CHANNEL_ID, USER, thought_time=0, thought_min=1 + ) + gen_dataset.get_lines(FULL_DATASET_PATH, N=5, distributed=True, offset=2) + with open(FULL_DATASET_PATH, "r", encoding="utf-8") as data_file: + list_dict_comp( + expected_values.gen_dataset_max_5_distributed_offset_2, + [loads(line) for line in data_file], + ) + data_file.close() + + +def test_gen_dataset_distributed_reverse(default_file_output): + gen_dataset.parse_logs( + FULL_LOGS_PATH, CHANNEL_ID, USER, thought_time=0, thought_min=1 + ) + gen_dataset.get_lines(FULL_DATASET_PATH, N=5, distributed=True, reverse=True) + with open(FULL_DATASET_PATH, "r", encoding="utf-8") as data_file: + list_dict_comp( + expected_values.gen_dataset_max_5_distributed_reverse, + [loads(line) for line in data_file], + ) + data_file.close() + + +def test_gen_dataset_distributed_reverse_offset(default_file_output): + gen_dataset.parse_logs( + FULL_LOGS_PATH, CHANNEL_ID, USER, thought_time=0, thought_min=1 + ) + gen_dataset.get_lines( + FULL_DATASET_PATH, N=5, distributed=True, reverse=True, offset=2 + ) + with open(FULL_DATASET_PATH, "r", encoding="utf-8") as data_file: + list_dict_comp( + expected_values.gen_dataset_max_5_distributed_reverse_offset_2, + [loads(line) for line in data_file], + ) + data_file.close() + + def test_parse_logs_user_not_found(default_file_output): username = "bad_username" with raises(