Skip to content

Commit

Permalink
Updated model to falcon (#8)
Browse files Browse the repository at this point in the history
  • Loading branch information
yonishelach authored Jul 12, 2023
1 parent 91145f9 commit 52809a1
Show file tree
Hide file tree
Showing 10 changed files with 476 additions and 368 deletions.
54 changes: 33 additions & 21 deletions project.yaml

Large diffs are not rendered by default.

96 changes: 96 additions & 0 deletions project_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import importlib

import mlrun


def assert_build():
    """Sanity-check the built image by importing each required package.

    Imports every dependency the demo image must provide and prints its
    version, so a failed build surfaces as an ImportError in the job log.
    """
    required_modules = (
        "torch",
        "transformers",
        "datasets",
        "accelerate",
        "evaluate",
        "deepspeed",
        "mpi4py",
    )
    for name in required_modules:
        # importlib.import_module raises ImportError if the package is
        # missing from the image, which fails the build-verification job.
        imported = importlib.import_module(name)
        print(imported.__version__)


def setup(
    project: mlrun.projects.MlrunProject
) -> mlrun.projects.MlrunProject:
    """
    Creating the project for this demo.

    Builds (or reuses) the default image, registers the data collection,
    preprocessing, training and serving functions, and sets the training
    workflow.

    :param project: The project to prepare.

    :returns: a fully prepared project for this demo.
    """
    print(project.get_param("source"))

    # Set or build the default image:
    default_image = project.get_param("default_image")
    if default_image is None:
        print("Building image for the demo:")
        image_builder = project.set_function(
            "project_setup.py",
            name="image-builder",
            handler="assert_build",
            kind="job",
            image="mlrun/ml-models-gpu",
            requirements=[
                "torch",
                "transformers[deepspeed]",
                "datasets",
                "accelerate",
                "evaluate",
                "mpi4py",
            ],
        )
        # Raise explicitly instead of `assert`, which is stripped under -O:
        if not image_builder.deploy():
            raise RuntimeError("Failed to build the demo image")
        # BUGFIX: use the freshly built image; the original passed the
        # (None) "default_image" param back in, discarding the build result.
        default_image = image_builder.spec.image
    project.set_default_image(default_image)

    # Set the project git source (pulled at runtime by each job):
    project.set_source(project.get_param("source"), pull_at_runtime=True)

    # Set the data collection function:
    data_collection_function = project.set_function(
        "src/data_collection.py",
        name="data-collecting",
        image="mlrun/mlrun",
        kind="job",
    )
    data_collection_function.apply(mlrun.auto_mount())
    data_collection_function.save()

    # Set the data preprocessing function:
    project.set_function(
        "src/data_preprocess.py",
        name="data-preparing",
        kind="job",
    )

    # Set the training function with its GPU/CPU/memory limits
    # (overridable via project params, with demo-scale defaults):
    train_function = project.set_function(
        "src/trainer.py",
        name="training",
        kind="job",
    )
    train_function.with_limits(
        gpus=project.get_param("num_gpus_per_replica") or 4,
        cpu=project.get_param("num_cpus_per_replica") or 48,
        mem=project.get_param("memory_per_replica") or "192Gi",
    )
    train_function.save()

    # Set the serving function:
    project.set_function(
        "src/serving.py",
        name="serving",
        kind="serving",
    )

    # Set the training workflow:
    project.set_workflow("training_workflow", "src/training_workflow.py")

    # Save and return the project:
    project.save()
    return project
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ transformers
datasets
accelerate
evaluate
bs4
bs4
einops
xformers
4 changes: 1 addition & 3 deletions src/data_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
from pathlib import Path
from urllib.request import Request, urlopen

import mlrun
from bs4 import BeautifulSoup, Tag

ARTICLE_TOKEN = "Article: "
HEADER_TOKEN = "Subject: "
HEADER_TOKEN = "### Human: "


def normalize(s: str) -> str:
Expand Down Expand Up @@ -57,7 +56,6 @@ def get_html_as_string(url: str, mark_headers: bool) -> str:
return soup.get_text()


@mlrun.handler(outputs=["html-as-text-files:directory"])
def collect_html_to_text_files(urls_file: str, mark_headers=True) -> str:
"""
Retrieve all html text content from URLs as text files.
Expand Down
18 changes: 6 additions & 12 deletions src/data_preprocess.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
import json
import os
import tempfile
import zipfile
from pathlib import Path

import mlrun
from datasets import load_dataset

ARTICLE_TOKEN = "Article: "
HEADER_TOKEN = "Subject: "
CONTENT_TOKEN = "Content: "
DATA_FORMAT = """Subject: {} {}
Content: {}"""
HEADER_TOKEN = "### Human: "
CONTENT_TOKEN = "### Assistant: "

DATA_FORMAT = """### Human: {} {}
### Assistant: {}"""
END_OF_ARTICLE = "Latest Posts"


Expand Down Expand Up @@ -62,7 +61,6 @@ def convert_textfile_to_data_with_prompts(txt_file: Path):
return data


@mlrun.handler(outputs=["html-data:dataset"])
def prepare_dataset(source_dir: str):
"""
Build the dataset from text files as a 'text: prompt' structure.
Expand All @@ -71,11 +69,7 @@ def prepare_dataset(source_dir: str):
:returns: A dataset with all the prompts inside
"""
with zipfile.ZipFile(source_dir, "r") as zip_file:
tmp_dir = tempfile.mkdtemp()
zip_file.extractall(tmp_dir)

path_list = Path(tmp_dir).glob("./*.txt")
path_list = Path(source_dir).glob("./*.txt")
data = []
# Converting text files into data in our prompt format:
for path in path_list:
Expand Down
122 changes: 0 additions & 122 deletions src/project_setup.py

This file was deleted.

Loading

0 comments on commit 52809a1

Please sign in to comment.