Merge pull request #54 from A-Baji/dev

3.0.8
A-Baji · Jul 12, 2024 · 0e7def1 · 0e7def1
2 parents bcb93ba + c0a9eae
commit 0e7def1
Show file tree

Hide file tree

Showing 8 changed files with 155 additions and 55 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,13 @@
 
 Observes [Semantic Versioning](https://semver.org/spec/v2.0.0.html) standard and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) convention.
 
+## [3.0.8] - 07-11-2024
+
+### Changed
+
+- changed distributed selection mode to a flag: `--distributed`
+- fixed a small bug in distributed mode when an offset is used
+
 ## [3.0.7] - 07-11-2024
 
 ### Changed
@@ -133,6 +140,8 @@ Observes [Semantic Versioning](https://semver.org/spec/v2.0.0.html) standard and
 
 - switched to `pathlib` for file path parsing
 
+[3.0.8]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.7...3.0.8
+[3.0.7]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.6...3.0.7
 [3.0.6]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.5...3.0.6
 [3.0.5]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.4...3.0.5
 [3.0.4]: https://github.com/A-Baji/discordAI-modelizer/compare/3.0.3...3.0.4

diff --git a/discordai_modelizer/command_line/command_line.py b/discordai_modelizer/command_line/command_line.py
@@ -47,7 +47,7 @@ def read_modelizer_args(args, model_subcommand, job_subcommand):
                 thought_min=args.thought_min,
                 max_entry_count=args.max_entries,
                 offset=args.offset,
-                select_mode=args.select_mode,
+                distributed=args.distributed,
                 reverse=args.reverse,
                 base_model=args.base_model,
                 clean=args.dirty,

diff --git a/discordai_modelizer/command_line/subparsers.py b/discordai_modelizer/command_line/subparsers.py
@@ -131,13 +131,11 @@ def setup_model_create(model_subcommand, is_parent=False):
         help="The offset by line index starting at 0 for where to start selecting lines for the dataset: DEFAULT=0",
     )
     model_create_optional_named.add_argument(
-        "-s",
-        "--select-mode",
-        choices=["sequential", "distributed"],
-        default="sequential",
+        "--distributed",
+        action="store_true",
         required=False,
-        dest="select_mode",
-        help="The method to select lines for the dataset, where `sequential` mode will select lines in chronological order, while `distributed` mode will select an even distribution of lines: DEFAULT=sequential",
+        dest="distributed",
+        help="Select lines as an even distribution instead of sequentially",
     )
     model_create_optional_named.add_argument(
         "--reverse_lines",

diff --git a/discordai_modelizer/customize.py b/discordai_modelizer/customize.py
@@ -23,7 +23,7 @@ def create_model(
     thought_min=4,
     max_entry_count=1000,
     offset=0,
-    select_mode="sequential",
+    distributed=False,
     base_model="none",
     reverse=False,
     clean=False,
@@ -99,7 +99,7 @@ def create_model(
         except UserNotFoundError as e:
             print(f"ERROR: {e}")
             return
-        get_lines(full_dataset_path, max_entry_count, offset, select_mode, reverse)
+        get_lines(full_dataset_path, max_entry_count, offset, distributed, reverse)
         if not clean:
             print(f"INFO: Dataset saved to {full_dataset_path}")
 

diff --git a/discordai_modelizer/gen_dataset.py b/discordai_modelizer/gen_dataset.py
@@ -123,19 +123,17 @@ def add_to_dataset(thought: str):
         )
 
 
-def get_lines(
-    file_name: str, N=1000, offset=0, select_mode="sequential", reverse=False
-):
+def get_lines(file_name: str, N=1000, offset=0, distributed=False, reverse=False):
     with open(file_name, "r") as f:
         lines = f.readlines()
     f.close()
 
     num_lines = len(lines)
 
-    if select_mode == "sequential":
-        step = 1
+    if distributed:
+        step = (num_lines - offset) // N
     else:
-        step = num_lines // N
+        step = 1
 
     if reverse:
         lines = lines[::-1]

diff --git a/discordai_modelizer/version.py b/discordai_modelizer/version.py
@@ -1 +1 @@
-__version__ = "3.0.7"
+__version__ = "3.0.8"
diff --git a/tests/expected_values.py b/tests/expected_values.py
@@ -23,8 +23,8 @@
     {"id": "gpt-3.5-turbo", "created": "2023-02-28 18:56:42"},
     {"id": "gpt-3.5-turbo-instruct", "created": "2023-08-24 18:23:47"},
     {"id": "gpt-3.5-turbo-instruct-0914", "created": "2023-09-07 21:34:32"},
-    {"id": "text-embedding-ada-002", "created": "2022-12-16 19:01:39"},
     {"id": "gpt-4o", "created": "2024-05-10 18:50:49"},
+    {"id": "text-embedding-ada-002", "created": "2022-12-16 19:01:39"},
     {"id": "davinci-002", "created": "2023-08-21 16:11:41"},
     {
         "id": "davinci:ft-personal:jason-9582-2022-12-23-05-45-51",
@@ -194,6 +194,18 @@
         "id": "ft:davinci-002:personal:1663-wardellstephe:9jgCpXW5",
         "created": "2024-07-11 05:00:27",
     },
+    {
+        "id": "ft:davinci-002:personal:wardellstephe-1663:9jxGWoJw:ckpt-step-769",
+        "created": "2024-07-11 23:13:25",
+    },
+    {
+        "id": "ft:davinci-002:personal:wardellstephe-1663:9jxGXXBT:ckpt-step-1538",
+        "created": "2024-07-11 23:13:25",
+    },
+    {
+        "id": "ft:davinci-002:personal:wardellstephe-1663:9jxGX7BS",
+        "created": "2024-07-11 23:13:25",
+    },
 ]
 
 list_module_expected_full = [
@@ -341,18 +353,18 @@
         "object": "model",
         "owned_by": "system",
     },
-    {
-        "id": "text-embedding-ada-002",
-        "created": "2022-12-16 19:01:39",
-        "object": "model",
-        "owned_by": "openai-internal",
-    },
     {
         "id": "gpt-4o",
         "created": "2024-05-10 18:50:49",
         "object": "model",
         "owned_by": "system",
     },
+    {
+        "id": "text-embedding-ada-002",
+        "created": "2022-12-16 19:01:39",
+        "object": "model",
+        "owned_by": "openai-internal",
+    },
     {
         "id": "davinci-002",
         "created": "2023-08-21 16:11:41",
@@ -611,9 +623,34 @@
         "object": "model",
         "owned_by": "user-ygljrfw1bneby79ndybpnodl",
     },
+    {
+        "id": "ft:davinci-002:personal:wardellstephe-1663:9jxGWoJw:ckpt-step-769",
+        "created": "2024-07-11 23:13:25",
+        "object": "model",
+        "owned_by": "user-ygljrfw1bneby79ndybpnodl",
+    },
+    {
+        "id": "ft:davinci-002:personal:wardellstephe-1663:9jxGXXBT:ckpt-step-1538",
+        "created": "2024-07-11 23:13:25",
+        "object": "model",
+        "owned_by": "user-ygljrfw1bneby79ndybpnodl",
+    },
+    {
+        "id": "ft:davinci-002:personal:wardellstephe-1663:9jxGX7BS",
+        "created": "2024-07-11 23:13:25",
+        "object": "model",
+        "owned_by": "user-ygljrfw1bneby79ndybpnodl",
+    },
 ]
 
 list_job_expected = [
+    {
+        "id": "ftjob-6DFW30DOSQAZaVQ4n3fZ9U4w",
+        "model": "davinci-002",
+        "status": "succeeded",
+        "created_at": "2024-07-11 23:01:08",
+        "finished_at": "2024-07-11 23:13:23",
+    },
     {
         "id": "ftjob-PJ6OUUoontOdeUlGmNDBvnfx",
         "model": "davinci-002",
@@ -747,16 +784,33 @@
         "created_at": "2024-06-23 22:46:25",
         "finished_at": "2024-06-23 22:49:44",
     },
-    {
-        "id": "ftjob-WMnwxBtBuJQgiX3ZRnuXlTF3",
-        "model": "gpt-3.5-turbo-0125",
-        "status": "succeeded",
-        "created_at": "2024-06-23 20:31:21",
-        "finished_at": "2024-06-23 21:16:09",
-    },
 ]
 
 list_job_expected_full = [
+    {
+        "id": "ftjob-6DFW30DOSQAZaVQ4n3fZ9U4w",
+        "created_at": "2024-07-11 23:01:08",
+        "error": {"code": None, "message": None, "param": None},
+        "fine_tuned_model": "ft:davinci-002:personal:wardellstephe-1663:9jxGX7BS",
+        "finished_at": "2024-07-11 23:13:23",
+        "hyperparameters": {
+            "n_epochs": 2,
+            "batch_size": 13,
+            "learning_rate_multiplier": 16,
+        },
+        "model": "davinci-002",
+        "object": "fine_tuning.job",
+        "organization_id": "org-UN3Y7fSoTFPVCWR6f4AMWsVf",
+        "result_files": ["file-dWjaHOWq5CWsJFUrAWJrZYH0"],
+        "seed": 706919378,
+        "status": "succeeded",
+        "trained_tokens": 348890,
+        "training_file": "file-9x6n6frPWT8mAx7BeuV0MNQM",
+        "validation_file": None,
+        "estimated_finish": None,
+        "integrations": [],
+        "user_provided_suffix": "wardellstephe_1663",
+    },
     {
         "id": "ftjob-PJ6OUUoontOdeUlGmNDBvnfx",
         "created_at": "2024-07-11 04:48:11",
@@ -1261,30 +1315,6 @@
         "integrations": [],
         "user_provided_suffix": "1061_adibiswat",
     },
-    {
-        "id": "ftjob-WMnwxBtBuJQgiX3ZRnuXlTF3",
-        "created_at": "2024-06-23 20:31:21",
-        "error": {"code": None, "message": None, "param": None},
-        "fine_tuned_model": "ft:gpt-3.5-turbo-0125:personal:1663-adibiswat:9dOrDhxp",
-        "finished_at": "2024-06-23 21:16:09",
-        "hyperparameters": {
-            "n_epochs": 3,
-            "batch_size": 1,
-            "learning_rate_multiplier": 2,
-        },
-        "model": "gpt-3.5-turbo-0125",
-        "object": "fine_tuning.job",
-        "organization_id": "org-UN3Y7fSoTFPVCWR6f4AMWsVf",
-        "result_files": ["file-8ya7wn36JZz6cc9tEy3diyE3"],
-        "seed": 1834725185,
-        "status": "succeeded",
-        "trained_tokens": 76449,
-        "training_file": "file-RhHsfdxIq0v1AnstxhULFSvO",
-        "validation_file": None,
-        "estimated_finish": None,
-        "integrations": [],
-        "user_provided_suffix": "1663_adibiswat",
-    },
 ]
 
 job_info_expected = {
@@ -2160,3 +2190,27 @@
     {"prompt": "adibiswat says:", "completion": " message."},
     {"prompt": "adibiswat says:", "completion": " message."},
 ]
+
+gen_dataset_max_5_distributed_offset_2 = [
+    {"prompt": "adibiswat says:", "completion": " a."},
+    {"prompt": "adibiswat says:", "completion": " ai tests."},
+    {"prompt": "adibiswat says:", "completion": " pretermined messages."},
+    {"prompt": "adibiswat says:", "completion": " another."},
+    {"prompt": "adibiswat says:", "completion": " also."},
+]
+
+gen_dataset_max_5_distributed_reverse = [
+    {"prompt": "adibiswat says:", "completion": " the last message."},
+    {"prompt": "adibiswat says:", "completion": " message."},
+    {"prompt": "adibiswat says:", "completion": " message."},
+    {"prompt": "adibiswat says:", "completion": " pretermined messages."},
+    {"prompt": "adibiswat says:", "completion": " discord."},
+]
+
+gen_dataset_max_5_distributed_reverse_offset_2 = [
+    {"prompt": "adibiswat says:", "completion": " message."},
+    {"prompt": "adibiswat says:", "completion": " a."},
+    {"prompt": "adibiswat says:", "completion": " message."},
+    {"prompt": "adibiswat says:", "completion": " for the test data."},
+    {"prompt": "adibiswat says:", "completion": " i."},
+]
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -145,7 +145,7 @@ def test_gen_dataset_distributed(default_file_output):
     gen_dataset.parse_logs(
         FULL_LOGS_PATH, CHANNEL_ID, USER, thought_time=0, thought_min=1
     )
-    gen_dataset.get_lines(FULL_DATASET_PATH, N=5, select_mode="distributed")
+    gen_dataset.get_lines(FULL_DATASET_PATH, N=5, distributed=True)
     with open(FULL_DATASET_PATH, "r", encoding="utf-8") as data_file:
         list_dict_comp(
             expected_values.gen_dataset_max_5_distributed,
@@ -154,6 +154,47 @@ def test_gen_dataset_distributed(default_file_output):
         data_file.close()
 
 
+def test_gen_dataset_distributed_offset(default_file_output):
+    gen_dataset.parse_logs(
+        FULL_LOGS_PATH, CHANNEL_ID, USER, thought_time=0, thought_min=1
+    )
+    gen_dataset.get_lines(FULL_DATASET_PATH, N=5, distributed=True, offset=2)
+    with open(FULL_DATASET_PATH, "r", encoding="utf-8") as data_file:
+        list_dict_comp(
+            expected_values.gen_dataset_max_5_distributed_offset_2,
+            [loads(line) for line in data_file],
+        )
+        data_file.close()
+
+
+def test_gen_dataset_distributed_reverse(default_file_output):
+    gen_dataset.parse_logs(
+        FULL_LOGS_PATH, CHANNEL_ID, USER, thought_time=0, thought_min=1
+    )
+    gen_dataset.get_lines(FULL_DATASET_PATH, N=5, distributed=True, reverse=True)
+    with open(FULL_DATASET_PATH, "r", encoding="utf-8") as data_file:
+        list_dict_comp(
+            expected_values.gen_dataset_max_5_distributed_reverse,
+            [loads(line) for line in data_file],
+        )
+        data_file.close()
+
+
+def test_gen_dataset_distributed_reverse_offset(default_file_output):
+    gen_dataset.parse_logs(
+        FULL_LOGS_PATH, CHANNEL_ID, USER, thought_time=0, thought_min=1
+    )
+    gen_dataset.get_lines(
+        FULL_DATASET_PATH, N=5, distributed=True, reverse=True, offset=2
+    )
+    with open(FULL_DATASET_PATH, "r", encoding="utf-8") as data_file:
+        list_dict_comp(
+            expected_values.gen_dataset_max_5_distributed_reverse_offset_2,
+            [loads(line) for line in data_file],
+        )
+        data_file.close()
+
+
 def test_parse_logs_user_not_found(default_file_output):
     username = "bad_username"
     with raises(