Skip to content

Commit

Permalink
Updated model to falcon (#8)
Browse files Browse the repository at this point in the history
  • Loading branch information
yonishelach authored Jul 12, 2023
1 parent 91145f9 commit 52809a1
Show file tree
Hide file tree
Showing 10 changed files with 476 additions and 368 deletions.
54 changes: 33 additions & 21 deletions project.yaml

Large diffs are not rendered by default.

96 changes: 96 additions & 0 deletions project_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import importlib

import mlrun


def assert_build():
    """Sanity-check the built image by importing each required package.

    Imports every dependency the demo image must provide and prints its
    version, so a failed build surfaces as an ImportError in the job log.
    """
    required_modules = (
        "torch",
        "transformers",
        "datasets",
        "accelerate",
        "evaluate",
        "deepspeed",
        "mpi4py",
    )
    for name in required_modules:
        # importlib.import_module raises ImportError if the package is
        # missing from the image, which fails the build-verification job.
        imported = importlib.import_module(name)
        print(imported.__version__)


def setup(
    project: mlrun.projects.MlrunProject
) -> mlrun.projects.MlrunProject:
    """
    Creating the project for this demo.

    Builds (or reuses) the default image, registers the data collection,
    preprocessing, training and serving functions, and sets the training
    workflow.

    :param project: The project to prepare.

    :returns: a fully prepared project for this demo.
    """
    print(project.get_param("source"))

    # Set or build the default image:
    default_image = project.get_param("default_image")
    if default_image is None:
        print("Building image for the demo:")
        image_builder = project.set_function(
            "project_setup.py",
            name="image-builder",
            handler="assert_build",
            kind="job",
            image="mlrun/ml-models-gpu",
            requirements=[
                "torch",
                "transformers[deepspeed]",
                "datasets",
                "accelerate",
                "evaluate",
                "mpi4py",
            ],
        )
        # Raise explicitly instead of `assert`, which is stripped under -O:
        if not image_builder.deploy():
            raise RuntimeError("Failed to build the demo image")
        # BUGFIX: use the freshly built image; the original passed the
        # (None) "default_image" param back in, discarding the build result.
        default_image = image_builder.spec.image
    project.set_default_image(default_image)

    # Set the project git source (pulled at runtime by each job):
    project.set_source(project.get_param("source"), pull_at_runtime=True)

    # Set the data collection function:
    data_collection_function = project.set_function(
        "src/data_collection.py",
        name="data-collecting",
        image="mlrun/mlrun",
        kind="job",
    )
    data_collection_function.apply(mlrun.auto_mount())
    data_collection_function.save()

    # Set the data preprocessing function:
    project.set_function(
        "src/data_preprocess.py",
        name="data-preparing",
        kind="job",
    )

    # Set the training function with its GPU/CPU/memory limits
    # (overridable via project params, with demo-scale defaults):
    train_function = project.set_function(
        "src/trainer.py",
        name="training",
        kind="job",
    )
    train_function.with_limits(
        gpus=project.get_param("num_gpus_per_replica") or 4,
        cpu=project.get_param("num_cpus_per_replica") or 48,
        mem=project.get_param("memory_per_replica") or "192Gi",
    )
    train_function.save()

    # Set the serving function:
    project.set_function(
        "src/serving.py",
        name="serving",
        kind="serving",
    )

    # Set the training workflow:
    project.set_workflow("training_workflow", "src/training_workflow.py")

    # Save and return the project:
    project.save()
    return project
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ transformers
datasets
accelerate
evaluate
bs4
bs4
einops
xformers
4 changes: 1 addition & 3 deletions src/data_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
from pathlib import Path
from urllib.request import Request, urlopen

import mlrun
from bs4 import BeautifulSoup, Tag

ARTICLE_TOKEN = "Article: "
HEADER_TOKEN = "Subject: "
HEADER_TOKEN = "### Human: "


def normalize(s: str) -> str:
Expand Down Expand Up @@ -57,7 +56,6 @@ def get_html_as_string(url: str, mark_headers: bool) -> str:
return soup.get_text()


@mlrun.handler(outputs=["html-as-text-files:directory"])
def collect_html_to_text_files(urls_file: str, mark_headers=True) -> str:
"""
Retrieve all html text content from URLs as text files.
Expand Down
18 changes: 6 additions & 12 deletions src/data_preprocess.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
import json
import os
import tempfile
import zipfile
from pathlib import Path

import mlrun
from datasets import load_dataset

ARTICLE_TOKEN = "Article: "
HEADER_TOKEN = "Subject: "
CONTENT_TOKEN = "Content: "
DATA_FORMAT = """Subject: {} {}
Content: {}"""
HEADER_TOKEN = "### Human: "
CONTENT_TOKEN = "### Assistant: "

DATA_FORMAT = """### Human: {} {}
### Assistant: {}"""
END_OF_ARTICLE = "Latest Posts"


Expand Down Expand Up @@ -62,7 +61,6 @@ def convert_textfile_to_data_with_prompts(txt_file: Path):
return data


@mlrun.handler(outputs=["html-data:dataset"])
def prepare_dataset(source_dir: str):
"""
Build the dataset from text files as a 'text: prompt' structure.
Expand All @@ -71,11 +69,7 @@ def prepare_dataset(source_dir: str):
:returns: A dataset with all the prompts inside
"""
with zipfile.ZipFile(source_dir, "r") as zip_file:
tmp_dir = tempfile.mkdtemp()
zip_file.extractall(tmp_dir)

path_list = Path(tmp_dir).glob("./*.txt")
path_list = Path(source_dir).glob("./*.txt")
data = []
# Converting text files into data in our prompt format:
for path in path_list:
Expand Down
122 changes: 0 additions & 122 deletions src/project_setup.py

This file was deleted.

Loading

0 comments on commit 52809a1

Please sign in to comment.