From 82b7a715e38e39aff1599dfdadeb9819b585c641 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sat, 4 May 2024 21:12:19 -0400 Subject: [PATCH] Import the training dir from the ai-lab-recipes repository This commit imports a specific commit ref (current `main`) of the `training` directory of the github.com/containers/ai-lab-recipes repository. Updating the contents to a newer commit ref can be done by updating the ref in the `Makefile` and then running `make update-training-dir`. Note there is currently a discrepancy between the `README.md` and this content. The `README.md` needs updates to reflect that all operations are to be done from within the `training` directory. Signed-off-by: Russell Bryant --- .markdownlint-cli2.yaml | 1 + Makefile | 11 ++ training/Makefile | 113 +++++++++++ training/README.md | 144 ++++++++++++++ training/amd-bootc/Containerfile | 34 ++++ training/amd-bootc/Makefile | 22 +++ training/amd-bootc/rocm.repo | 7 + training/cloud/Makefile | 23 +++ training/common/Makefile.common | 101 ++++++++++ .../system/bootc-generic-growpart.service | 20 ++ .../bootc-generic-growpart.service | 1 + .../common/usr/libexec/bootc-generic-growpart | 41 ++++ training/deepspeed/Containerfile | 17 ++ training/deepspeed/Makefile | 15 ++ training/ilab-wrapper/ilab | 183 ++++++++++++++++++ training/ilab-wrapper/ilab-qlora | 47 +++++ training/ilab-wrapper/ilab-training-launcher | 86 ++++++++ training/instructlab/Makefile | 24 +++ training/intel-bootc/Containerfile | 83 ++++++++ training/intel-bootc/Makefile | 22 +++ training/model/Containerfile | 11 ++ training/model/Makefile | 22 +++ training/nvidia-bootc/Containerfile | 183 ++++++++++++++++++ training/nvidia-bootc/Containerfile.builder | 56 ++++++ training/nvidia-bootc/Makefile | 50 +++++ .../nvidia-toolkit-firstboot.service | 13 ++ training/nvidia-bootc/x509-configuration.ini | 15 ++ training/provision/ansible.cfg | 2 + training/provision/playbook.yml | 77 ++++++++ training/provision/requirements.yml | 4 + 
training/provision/templates/Containerfile.j2 | 9 + training/vllm/Containerfile | 5 + training/vllm/Makefile | 14 ++ training/vllm/mixtral.jinja | 12 ++ 34 files changed, 1468 insertions(+) create mode 100644 training/Makefile create mode 100644 training/README.md create mode 100644 training/amd-bootc/Containerfile create mode 100644 training/amd-bootc/Makefile create mode 100644 training/amd-bootc/rocm.repo create mode 100644 training/cloud/Makefile create mode 100644 training/common/Makefile.common create mode 100644 training/common/usr/lib/systemd/system/bootc-generic-growpart.service create mode 120000 training/common/usr/lib/systemd/system/local-fs.target.wants/bootc-generic-growpart.service create mode 100755 training/common/usr/libexec/bootc-generic-growpart create mode 100644 training/deepspeed/Containerfile create mode 100644 training/deepspeed/Makefile create mode 100755 training/ilab-wrapper/ilab create mode 100755 training/ilab-wrapper/ilab-qlora create mode 100755 training/ilab-wrapper/ilab-training-launcher create mode 100644 training/instructlab/Makefile create mode 100644 training/intel-bootc/Containerfile create mode 100644 training/intel-bootc/Makefile create mode 100644 training/model/Containerfile create mode 100644 training/model/Makefile create mode 100644 training/nvidia-bootc/Containerfile create mode 100644 training/nvidia-bootc/Containerfile.builder create mode 100644 training/nvidia-bootc/Makefile create mode 100644 training/nvidia-bootc/nvidia-toolkit-firstboot.service create mode 100644 training/nvidia-bootc/x509-configuration.ini create mode 100644 training/provision/ansible.cfg create mode 100644 training/provision/playbook.yml create mode 100644 training/provision/requirements.yml create mode 100644 training/provision/templates/Containerfile.j2 create mode 100644 training/vllm/Containerfile create mode 100644 training/vllm/Makefile create mode 100644 training/vllm/mixtral.jinja diff --git a/.markdownlint-cli2.yaml 
b/.markdownlint-cli2.yaml index 21a628b..288ae41 100644 --- a/.markdownlint-cli2.yaml +++ b/.markdownlint-cli2.yaml @@ -13,3 +13,4 @@ ignores: - "**/node_modules/**" - ".tox/**" - "venv/**" + - "training/**" diff --git a/Makefile b/Makefile index 182b307..9fa181b 100644 --- a/Makefile +++ b/Makefile @@ -20,3 +20,14 @@ endif md-lint: ## Lint markdown files $(ECHO_PREFIX) printf " %-12s ./...\n" "[MD LINT]" $(CMD_PREFIX) podman run --rm -v $(CURDIR):/workdir --security-opt label=disable docker.io/davidanson/markdownlint-cli2:v0.6.0 > /dev/null + +AI_LAB_REPO:=https://github.com/containers/ai-lab-recipes.git +AI_LAB_REF:=a98479cf572d1f2eb6a70bfbd9ed49f4c12e0c61 +.PHONY: update-training-dir +update-training-dir: ## Update the contents of the training directory + $(ECHO_PREFIX) printf " %-12s $(AI_LAB_REF)\n" "[UPDATE TRAINING DIR]" + $(CMD_PREFIX) [ ! -d ai-lab-recipes ] || rm -rf ai-lab-recipes + $(CMD_PREFIX) git clone ${AI_LAB_REPO} ai-lab-recipes + $(CMD_PREFIX) mkdir -p training + $(CMD_PREFIX) cd ai-lab-recipes && git archive $(AI_LAB_REF) training | tar -x -C ../ + $(CMD_PREFIX) rm -rf ai-lab-recipes diff --git a/training/Makefile b/training/Makefile new file mode 100644 index 0000000..6710838 --- /dev/null +++ b/training/Makefile @@ -0,0 +1,113 @@ +default: help + +help: + @echo "To build a bootable container image you first need to create instructlab container images for a particular vendor " + @echo + @echo " - make instruct-amd" + @echo " - make instruct-intel" + @echo " - make instruct-nvidia" + @echo " - make instruct-vllm" + @echo + @echo "Once instruct images created, create advanced training containers, deepspeed only for nvidia" + @echo + @echo " - make deepspeed" + @echo " - make vllm" + @echo + @echo "Once instruct images are created, create bootc container images" + @echo + @echo " - make amd" + @echo " - make intel" + @echo " - make nvidia" + @echo + @echo "If these images are going to be used on a cloud, you might want to add cloud-init." 
+ @echo + @echo " - make cloud-amd" + @echo " - make cloud-intel" + @echo " - make cloud-nvidia" + @echo " - make cloud-vllm" + @echo + @echo "Make prune. This command will remove all buildah containers if left behind from podman build and then prune all unused container images. Useful if you are running out of space." + @echo + @echo " - make prune" + @echo + @echo "To create a disk image" + @echo + @echo " - make disk-amd" + @echo " - make disk-intel" + @echo " - make disk-nvidia" + +# +# Create instructlab AI container images +# +.PHONY: +instruct-amd: + make -C instructlab amd + +.PHONY: +instruct-nvidia: + make -C instructlab nvidia + +.PHONY: +instruct: instruct-amd instruct-nvidia + +.PHONY: deepspeed +deepspeed: + make -C deepspeed/ image + +.PHONY: vllm +vllm: + make -C vllm/ image + +# +# Create bootc container images prepared for AI +# +.PHONY: amd nvidia intel vllm +amd: + make -C amd-bootc/ bootc +intel: + make -C intel-bootc/ bootc +nvidia: + make -C nvidia-bootc/ dtk bootc + +# +# Make Bootc container images preinstalled with cloud-init +# +.PHONY: +cloud-amd: + make VENDOR=amd -C cloud + +.PHONY: +cloud-intel: + make VENDOR=intel -C cloud + +.PHONY: +cloud-nvidia: + make VENDOR=nvidia -C cloud + +.PHONY: +cloud: cloud-amd cloud-intel cloud-nvidia + +# +# We often see users running out of space. These commands are useful for freeing wasted space. +# Note: be careful to not run this target if a podman build is in progress. 
+# +.PHONY: prune +prune: + buildah rm --all + podman image prune -f + +# Create disk images with bootc-image-builder +# +.PHONY: disk-amd +disk-amd: + make -C amd-bootc/ bootc-image-builder +.PHONY: disk-intel +disk-intel: + make -C intel-bootc/ bootc-image-builder +.PHONY: disk-nvidia +disk-nvidia: + make -C nvidia-bootc/ bootc-image-builder + +.PHONY: clean +clean: + rm -rf build diff --git a/training/README.md b/training/README.md new file mode 100644 index 0000000..71b7405 --- /dev/null +++ b/training/README.md @@ -0,0 +1,144 @@ +Linux Operating System Bootable containers enabled for AI Training +=== + +In order to run accelerated AI workloads, we've prepared [bootc](https://github.com/containers/bootc) container images for the major AI platforms. + +# Makefile targets + +| Target | Description | +|-----------------|---------------------------------------------------------------------| +| amd | Create bootable container for AMD platform | +| deepspeed | DeepSpeed container for optimization deep learning | +| cloud-amd | Add cloud-init to bootable container for AMD platform | +| cloud-intel | Add cloud-init to bootable container for Intel platform | +| cloud-nvidia | Add cloud-init to bootable container for Nvidia platform | +| disk-amd | Create disk image from bootable container for AMD platform | +| disk-intel | Create disk image from bootable container for Intel platform | +| disk-nvidia | Create disk image from bootable container for Nvidia platform | +| instruct-amd | Create instruct lab image for bootable container for AMD platform | +| instruct-intel | Create instruct lab image for bootable container for Intel platform | +| instruct-nvidia | Create instruct lab image for bootable container for Nvidia platform| +| intel | Create bootable container for Intel Habanalabs platform | +| nvidia | Create bootable container for NVidia platform | +| vllm | Containerized inference/serving engine for LLMs | + +# Makefile variables + +| Variable | Description | 
Default | +|---------------------------|-------------------------------------------------|---------------------------------------------| +| FROM | Overrides the base image for the Containerfiles | `quay.io/centos-bootc/centos-bootc:stream9` | +| REGISTRY | Container Registry for storing container images | `quay.io` | +| REGISTRY_ORG | Container Registry organization | `ai-lab` | +| IMAGE_NAME | Container image name | platform (i.e. `amd`) | +| IMAGE_TAG | Container image tag | `latest` | +| CONTAINER_TOOL | Container tool used for build | `podman` | +| CONTAINER_TOOL_EXTRA_ARGS | Container tool extra arguments | ` ` | + + +Note: AI content is huge and requires a lot of disk space >200GB free to build. + +# How to build InstructLab containers + +In order to do AI Training you need to build instructlab container images. + +Simply execute `make instruct-`. For example: + +* make instruct-amd +* make instruct-intel +* make instruct-nvidia + +Once you have these container images built it is time to build vllm. + +# How to build the vllm inference engine + +* make vllm + +# On nvidia systems, you need to build the deepspeed container + +* make deepspeed + +# How to build bootc container images + +In order to build the images (by default based on CentOS Stream), a simple `make ` should be enough. For example to build the `nvidia`, `amd` and `intel` bootc containers, respectively: + +``` +make nvidia +make amd +make intel +``` + +## How to build bootc container images based on Red Hat Enterprise Linux + +In order to build the training images based on Red Hat Enterprise Linux bootc images, the appropriate base container image must be used in the `FROM` field and the build process must be run on an *entitled Red Hat 9.x Enterprise Linux* with a valid subscription. 
+ +For example: + +``` +make nvidia FROM=registry.redhat.io/rhel9/rhel-bootc:9.4 +make amd FROM=registry.redhat.io/rhel9/rhel-bootc:9.4 +make intel FROM=registry.redhat.io/rhel9/rhel-bootc:9.4 +``` + +Of course, the other Makefile variables are still available, so the following is a valid build command: + +``` +make nvidia REGISTRY=myregistry.com REGISTRY_ORG=ai-training IMAGE_NAME=nvidia IMAGE_TAG=v1 FROM=registry.redhat.io/rhel9/rhel-bootc:9.4 +``` + +# How to build Cloud ready images + +Bootc container images can be installed on physical machines, virtual machines and in the cloud. Often it is useful to add the cloud-init package when running the operating systems in the cloud. + +To add cloud-init to your existing bootc container image, executing `make cloud-` should be enough. For example to build the `cloud-nvidia`, `cloud-amd` and `cloud-intel` bootc containers, respectively: + +``` +make cloud-nvidia +make cloud-amd +make cloud-intel +``` + +# How to build disk images +bootc-image-builder produces disk images using a bootable container as input. 
Disk images can be used to directly provision a host +The process will write the disk image in -bootc/build + +To invoke bootc-image-builder, execute make disk- +``` +make disk-nvidia +``` +or +``` +make disk-nvidia DISK_TYPE=ami BOOTC_IMAGE=quay.io/ai-lab/nvidia-bootc-cloud:latest +``` + +In addition to the variables common to all targets, a few extra can be defined to customize disk image creation + +| Variable | Description | Default | +|-----------------------|-----------------------------------|--------------------------------------------------| +| BOOTC_IMAGE | Image to use as input | `$REGISTRY/$REGISTRY_ORG/$IMAGE_NAME:$IMAGE_TAG` | +| DISK_TYPE | Type of image to build | `qcow2` | +| IMAGE_BUILDER_CONFIG | Path to a build-config file | `EMPTY` | + +Image builder config file is documented in [bootc-image-builder README](https://github.com/osbuild/bootc-image-builder?tab=readme-ov-file#-build-config) + +The following image disk types are currently available: +| Disk type | Target environment | +|-----------------------|---------------------------------------------------------------------------------------| +| `ami` | [Amazon Machine Image](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html) | +| `qcow2` **(default)** | [QEMU](https://www.qemu.org/) | +| `vmdk` | [VMDK](https://en.wikipedia.org/wiki/VMDK) usable in vSphere, among others | +| `anaconda-iso` | An unattended Anaconda installer that installs to the first disk found. | +| `raw` | Unformatted [raw disk](https://en.wikipedia.org/wiki/Rawdisk). | + +# Troubleshooting + +Sometimes, interrupting the build process may lead to wanting a complete restart of the process. For those cases, we can instruct `podman` to start from scratch and discard the cached layers. 
This is possible by passing the `--no-cache` parameter to the build process by using the `CONTAINER_TOOL_EXTRA_ARGS` variable: + +``` +make CONTAINER_TOOL_EXTRA_ARGS="--no-cache" +``` + +The building of accelerated images requires a lot of temporary disk space. In case you need to specify a directory for temporary storage, this can be done with the `TMPDIR` environment variable: + +``` +make TMPDIR=/path/to/tmp +``` diff --git a/training/amd-bootc/Containerfile b/training/amd-bootc/Containerfile new file mode 100644 index 0000000..1524dbd --- /dev/null +++ b/training/amd-bootc/Containerfile @@ -0,0 +1,34 @@ +FROM quay.io/centos-bootc/centos-bootc:stream9 + +ADD rocm.repo /etc/yum.repos.d/rocm.repo + +# Include growfs service +COPY build/usr /usr + +ARG EXTRA_RPM_PACKAGES='' +RUN dnf install -y \ + rocm-smi \ + ${EXTRA_RPM_PACKAGES} \ + && dnf clean all + +# Setup /usr/lib/containers/storage as an additional store for images. +# Remove once the base images have this set by default. +RUN sed -i -e '/additionalimage.*/a "/usr/lib/containers/storage",' \ + /etc/containers/storage.conf && \ + cp /run/.input/ilab /usr/local/bin/ilab + +ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-amd:latest" +ARG VLLM_IMAGE + +RUN sed -i 's/__REPLACE_TRAIN_DEVICE__/cuda/' /usr/local/bin/ilab +RUN sed -i 's/__REPLACE_CONTAINER_DEVICE__/nvidia.com\/gpu=all/' /usr/local/bin/ilab +RUN sed -i "s%__REPLACE_CONTAINER_NAME__%${INSTRUCTLAB_IMAGE}%" /usr/local/bin/ilab + +# Added for running as an OCI Container to prevent Overlay on Overlay issues. 
+VOLUME /var/lib/containers + +# Prepull the instructlab image +RUN IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/vllm) && \ + podman --root /usr/lib/containers/storage image tag ${IID} ${VLLM_IMAGE} +RUN IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/instructlab-amd) && \ + podman --root /usr/lib/containers/storage image tag ${IID} ${INSTRUCTLAB_IMAGE} diff --git a/training/amd-bootc/Makefile b/training/amd-bootc/Makefile new file mode 100644 index 0000000..a4a6381 --- /dev/null +++ b/training/amd-bootc/Makefile @@ -0,0 +1,22 @@ + +VENDOR ?= amd +IMAGE_NAME ?= $(VENDOR)-bootc + +include ../common/Makefile.common + +default: bootc + +.PHONY: bootc +bootc: prepare-files growfs + "${CONTAINER_TOOL}" build \ + $(ARCH:%=--platform linux/%) \ + --security-opt label=disable \ + --cap-add SYS_ADMIN \ + --file Containerfile \ + -v ${OUTDIR}:/run/.input:ro \ + --tag "${BOOTC_IMAGE}" \ + --build-arg "INSTRUCTLAB_IMAGE=$(INSTRUCTLAB_IMAGE)" \ + --build-arg "VLLM_IMAGE=$(VLLM_IMAGE)" \ + $(EXTRA_RPM_PACKAGES:%=--build-arg EXTRA_RPM_PACKAGES=%) \ + $(FROM:%=--from=%) \ + ${CONTAINER_TOOL_EXTRA_ARGS} . 
diff --git a/training/amd-bootc/rocm.repo b/training/amd-bootc/rocm.repo new file mode 100644 index 0000000..3b9f9fc --- /dev/null +++ b/training/amd-bootc/rocm.repo @@ -0,0 +1,7 @@ +[ROCm-6.0.2] +name=ROCm6.0.2 +baseurl=https://repo.radeon.com/rocm/rhel$releasever/6.0.2/main +enabled=1 +priority=50 +gpgcheck=1 +gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key diff --git a/training/cloud/Makefile b/training/cloud/Makefile new file mode 100644 index 0000000..5c91dad --- /dev/null +++ b/training/cloud/Makefile @@ -0,0 +1,23 @@ +default: cloud + +include ../common/Makefile.common + +REGISTRY ?= quay.io +REGISTRY_ORG ?= ai-lab +IMAGE_TAG ?= latest + +.PHONY: init +init: + git clone https://gitlab.com/bootc-org/examples.git 2> /dev/null || true + (cd examples; git pull origin main) + +.PHONY: cloud +cloud: init + "${CONTAINER_TOOL}" build \ + $(ARCH:%=--platform linux/%) \ + --tag "${REGISTRY}/${REGISTRY_ORG}/${IMAGE_NAME}-cloud:${IMAGE_TAG}" \ + --from="${BOOTC_IMAGE}" \ + examples/cloud-init + +.PHONY: push +push: push-amd push-nvidia diff --git a/training/common/Makefile.common b/training/common/Makefile.common new file mode 100644 index 0000000..8a178d2 --- /dev/null +++ b/training/common/Makefile.common @@ -0,0 +1,101 @@ +FROM ?= +VENDOR ?= + +REGISTRY ?= quay.io +REGISTRY_ORG ?= ai-lab +IMAGE_NAME ?= $(VENDOR)-bootc +IMAGE_TAG ?= latest +BOOTC_IMAGE ?= ${REGISTRY}/${REGISTRY_ORG}/${IMAGE_NAME}:${IMAGE_TAG} + +CONTAINER_TOOL ?= podman +CONTAINER_TOOL_EXTRA_ARGS ?= +EXTRA_RPM_PACKAGES ?= +GRAPH_ROOT=$(shell podman info --format '{{ .Store.GraphRoot }}') +AUTH_JSON ?= + +BOOTC_IMAGE_BUILDER ?= quay.io/centos-bootc/bootc-image-builder +IMAGE_BUILDER_CONFIG ?= +DISK_TYPE ?= qcow2 +DISK_UID ?= $(shell id -u) +DISK_GID ?= $(shell id -g) + +ARCH ?= + +DRIVER_VERSION ?= +KERNEL_VERSION ?= + +INSTRUCTLAB_IMAGE = $(REGISTRY)/$(REGISTRY_ORG)/instructlab-$(VENDOR):$(IMAGE_TAG) +VLLM_IMAGE = $(REGISTRY)/$(REGISTRY_ORG)/vllm:$(IMAGE_TAG) +TRAIN_IMAGE = 
$(REGISTRY)/$(REGISTRY_ORG)/deepspeed-trainer:$(IMAGE_TAG) +INSTRUCTLAB_IMAGE_ID = $(shell $(CONTAINER_TOOL) image inspect $(INSTRUCTLAB_IMAGE) --format {{.Id}}) +VLLM_IMAGE_ID = $(shell $(CONTAINER_TOOL) image inspect $(VLLM_IMAGE) --format {{.Id}}) +TRAIN_IMAGE_ID = $(shell $(CONTAINER_TOOL) image inspect $(TRAIN_IMAGE) --format {{.Id}}) +WRAPPER = $(CURDIR)/../ilab-wrapper/ilab +QLORA_WRAPPER = $(CURDIR)/../ilab-wrapper/ilab-qlora +TRAIN_WRAPPER = $(CURDIR)/../ilab-wrapper/ilab-training-launcher +OUTDIR = $(CURDIR)/../build + +SSH_PUBKEY ?= $(shell cat ${HOME}/.ssh/id_rsa.pub 2> /dev/null) + +.PHONY: prepare-files +prepare-files: $(OUTDIR)/$(WRAPPER) $(OUTDIR)/$(QLORA_WRAPPER) $(OUTDIR)/$(TRAIN_WRAPPER) $(OUTDIR)/$(INSTRUCTLAB_IMAGE_ID) $(OUTDIR)/$(VLLM_IMAGE_ID) $(OUTDIR)/$(TRAIN_IMAGE_ID) + +$(OUTDIR): + mkdir -p $(OUTDIR) + +$(OUTDIR)/$(WRAPPER): $(OUTDIR) + cp -pf $(WRAPPER) $(OUTDIR) +$(OUTDIR)/$(QLORA_WRAPPER): $(OUTDIR) + cp -pf $(QLORA_WRAPPER) $(OUTDIR) +$(OUTDIR)/$(TRAIN_WRAPPER): $(OUTDIR) + cp -pf $(TRAIN_WRAPPER) $(OUTDIR) + +$(OUTDIR)/$(INSTRUCTLAB_IMAGE_ID): + @mkdir -p $(OUTDIR)/$(INSTRUCTLAB_IMAGE_ID) + $(CONTAINER_TOOL) push --compress=false $(INSTRUCTLAB_IMAGE) oci:$(OUTDIR)/$(INSTRUCTLAB_IMAGE_ID)/ +$(OUTDIR)/$(VLLM_IMAGE_ID): + @mkdir -p $(OUTDIR)/$(VLLM_IMAGE_ID) + $(CONTAINER_TOOL) push --compress=false $(VLLM_IMAGE) oci:$(OUTDIR)/$(VLLM_IMAGE_ID)/ +$(OUTDIR)/$(TRAIN_IMAGE_ID): + @mkdir -p $(OUTDIR)/$(TRAIN_IMAGE_ID) + $(CONTAINER_TOOL) push --compress=false $(TRAIN_IMAGE) oci:$(OUTDIR)/$(TRAIN_IMAGE_ID)/ + +.PHONY: check-sshkey +check-sshkey: + @test -n "$(SSH_PUBKEY)" || \ + (echo -n "Error: no ssh key defined! 
"; \ + echo "Create ~/.ssh/id_rsa.pub or set SSH_PUBKEY"; exit 1) + +.PHONY: push +push: + podman push "${REGISTRY}/${REGISTRY_ORG}/${IMAGE_NAME}:${IMAGE_TAG}" + +.PHONY: growfs +growfs: + # Add growfs service + mkdir -p build; cp -pR ../common/usr build + +.PHONY: bootc-image-builder +bootc-image-builder: + mkdir -p build/store + podman run \ + --rm \ + -ti \ + -v $(GRAPH_ROOT):/var/lib/containers/storage \ + $(AUTH_JSON:%=-v %:/run/containers/0/auth.json) \ + $(IMAGE_BUILDER_CONFIG:%=-v %:/config.json) \ + --privileged \ + --pull newer \ + -v ./build:/output \ + -v ./build/store:/store \ + $(BOOTC_IMAGE_BUILDER) \ + $(ARCH:%=--target-arch %) \ + $(IMAGE_BUILDER_CONFIG:%=--config /config.json) \ + --type $(DISK_TYPE) \ + --chown $(DISK_UID):$(DISK_GID) \ + --local \ + $(BOOTC_IMAGE) + +.PHONY: clean +clean: + rm -rf build diff --git a/training/common/usr/lib/systemd/system/bootc-generic-growpart.service b/training/common/usr/lib/systemd/system/bootc-generic-growpart.service new file mode 100644 index 0000000..77bb310 --- /dev/null +++ b/training/common/usr/lib/systemd/system/bootc-generic-growpart.service @@ -0,0 +1,20 @@ +[Unit] +Description=Bootc Fallback Root Filesystem Grow +Documentation=https://gitlab.com/fedora/bootc/docs +# For now we skip bare metal cases, and we also have nothing to do +# for containers. +ConditionVirtualization=vm +# This helps verify that we're running in a bootc/ostree based target. +ConditionPathIsMountPoint=/sysroot +# We want to run before any e.g. large container images might be pulled. 
+DefaultDependencies=no +Requires=sysinit.target +After=sysinit.target +Before=basic.target + +[Service] +ExecStart=/usr/libexec/bootc-generic-growpart +# So we can temporarily remount the sysroot writable +MountFlags=slave +# Just to auto-cleanup our temporary files +PrivateTmp=yes diff --git a/training/common/usr/lib/systemd/system/local-fs.target.wants/bootc-generic-growpart.service b/training/common/usr/lib/systemd/system/local-fs.target.wants/bootc-generic-growpart.service new file mode 120000 index 0000000..c8e2408 --- /dev/null +++ b/training/common/usr/lib/systemd/system/local-fs.target.wants/bootc-generic-growpart.service @@ -0,0 +1 @@ +../bootc-generic-growpart.service \ No newline at end of file diff --git a/training/common/usr/libexec/bootc-generic-growpart b/training/common/usr/libexec/bootc-generic-growpart new file mode 100755 index 0000000..c2277ba --- /dev/null +++ b/training/common/usr/libexec/bootc-generic-growpart @@ -0,0 +1,41 @@ +#!/bin/bash +set -eu + +backing_device=$(findmnt -vno SOURCE /sysroot) +echo "Backing device: ${backing_device}" +syspath=/sys/class/block/$(basename "${backing_device}") +if ! test -d "${syspath}"; then + echo "failed to find backing device ${syspath}"; exit 1 +fi + +# Handling devicemapper targets is a whole other thing +case $backing_device in + /dev/mapper/*) "Not growing $backing_device"; exit 0 ;; +esac + +# Note that we expect that the rootfs is on a partition +partition=$(cat "${syspath}"/partition) + +# Walk up to find the parent blockdev +parentpath=$(dirname "$(realpath "${syspath}")") +devmajmin=$(cat "${parentpath}"/dev) +parent="/dev/block/${devmajmin}" + +# Grow the partition +tmpf=$(mktemp) +# Ignore errors because growpart exits 1 if nothing changed; +# we need to check the output for NOCHANGE: +if ! 
/usr/bin/growpart "${parent}" "${partition}" > "${tmpf}"; then + cat "${tmpf}" + if grep -qEe '^NOCHANGE: ' "${tmpf}"; then + exit 0 + fi + echo "growpart failed" + exit 1 +fi +cat "${tmpf}" +# Now, temporarily remount the sysroot writable in our mount namespace +mount -o remount,rw /sysroot +# And defer to systemd's growfs wrapper which handles dispatching on +# the target filesystem type. +/usr/lib/systemd/systemd-growfs /sysroot diff --git a/training/deepspeed/Containerfile b/training/deepspeed/Containerfile new file mode 100644 index 0000000..854864d --- /dev/null +++ b/training/deepspeed/Containerfile @@ -0,0 +1,17 @@ +# Containerfile for running deepspeed training +FROM nvcr.io/nvidia/cuda:12.1.1-cudnn8-devel-ubi9 + +RUN dnf install -y python python-devel git +RUN python -m ensurepip --upgrade +RUN pip3 install torch==2.1.2 --index-url https://download.pytorch.org/whl/cu121 +RUN pip3 install packaging wheel +RUN pip3 install flash-attn==2.5.7 +RUN pip3 install deepspeed==0.14.2 +RUN pip3 install transformers==4.40.1 +RUN pip3 install ipdb jupyterlab gpustat matplotlib hydra-core datasets rich numba +RUN git clone https://github.com/instructlab/training.git +RUN mkdir -p /ilab-data/training_output + +WORKDIR /training + +CMD ["/bin/bash"] diff --git a/training/deepspeed/Makefile b/training/deepspeed/Makefile new file mode 100644 index 0000000..c1d7650 --- /dev/null +++ b/training/deepspeed/Makefile @@ -0,0 +1,15 @@ +IMAGE_NAME ?= deepspeed-trainer +CONTAINER_TOOL ?= podman + +default: image + +.PHONY: image +image: + @mkdir -p ../build + rm -rf ../build/deepspeed-trainer + "${CONTAINER_TOOL}" build \ + $(ARCH:%=--platform linux/%) \ + --file Containerfile \ + --layers=false \ + --squash-all \ + --tag oci:../build/deepspeed-trainer . 
diff --git a/training/ilab-wrapper/ilab b/training/ilab-wrapper/ilab new file mode 100755 index 0000000..faac3d7 --- /dev/null +++ b/training/ilab-wrapper/ilab @@ -0,0 +1,183 @@ +#!/bin/bash + +# Template values replaced by container build +ENDPOINT_URL="__REPLACE_ENDPOINT_URL__" +TRAIN_DEVICE="__REPLACE_TRAIN_DEVICE__" +CONTAINER_DEVICE="__REPLACE_CONTAINER_DEVICE__" +IMAGE_NAME="__REPLACE_IMAGE_NAME__" +VLLM_NAME="__REPLACE_VLLM_NAME__" +TRAIN_NAME="__REPLACE_TRAIN_NAME__" +GPU_COUNT_COMMAND="__REPLACE_GPU_COUNT_COMMAND__" + +# ENDPOINT_URL="http://0.0.0.0:8080/v1" +# TRAIN_DEVICE="cuda" +# CONTAINER_DEVICE="nvidia.com/gpu=all" +# IMAGE_NAME="quay.io/ai-lab/instructlab-nvidia:latest" +# VLLM_NAME="quay.io/ai-lab/vllm:latest" +# TRAIN_NAME="quay.io/ai-lab/deepspeed-trainer:latest" +# GPU_COUNT_COMMAND="nvidia-ctk --quiet cdi list | grep -P nvidia.com/gpu='\d+' | wc -l" + +# HF caching uses relative symlink structures, so keep cache relative to +# the central working directory +CONTAINER_CACHE="/instructlab/cache" +HOST_CACHE="$(pwd)/cache" +WORKDIR="$(pwd)" +SCRIPT_DIR=$(dirname "$0") +DEFAULT_SERVE_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1" + +if [[ -z "${GPU_AMOUNT}" ]]; then + GPU_AMOUNT=$(bash -c "${GPU_COUNT_COMMAND}") + if [[ "$?" 
!= "0" ]]; then + echo "Could not determine GPU count, set export GPU_AMOUNT= manually" + exit + fi +fi + +if [[ "$GPU_AMOUNT" -le 2 ]]; then + echo "WARNING: You need at least 2 GPUs to load full precision models" +fi + +NPROC_PER_NODE=${GPU_AMOUNT} +EFFECTIVE_BATCH_SIZE=$((12*${GPU_AMOUNT})) +NUM_INSTRUCTIONS=5000 +NUM_EPOCHS=10 + +has_argument() { + match=$1 + shift + for arg in "$@"; do + if [[ "$arg" == *"$match"* ]]; then + return 0 + fi + done + return 1 +} + +get_argument() { + local match=$1 + shift + + local found=false + local arg + while [ "$#" -gt 0 ]; do + arg="$1" + shift + if [[ "$arg" == "$match" ]]; then + found=true + if [ "$#" -gt 0 ]; then + echo "$1" + return 0 + else + echo "" + return 0 + fi + fi + done + + if ! $found; then + echo "" + return 0 + fi +} + +get_argument_default() { + local match=$1 + local default=$2 + shift + shift + local result=$(get_argument ${match} "$@") + if [[ -z "${result}" ]]; then + echo $default + return 0 + fi + echo "${result}" +} + +get_model() { + model=$(get_argument_default "--model" "${DEFAULT_SERVE_MODEL}" "$@") + if [[ ! "${model}" =~ ^/instructlab/models.* ]]; then + echo /instructlab/models/"${model}" + else + echo "${model}" + fi +} + +mkdir -p "${HOST_CACHE}" +PODMAN_COMMAND=("podman" "run" "--rm" "-it" "--device" "${CONTAINER_DEVICE}" \ + "--security-opt" "label=disable" "--net" "host" \ + "-v" "${WORKDIR}:/instructlab" "--entrypoint" "" \ + "-e" "HF_HOME=${CONTAINER_CACHE}" \ + "${IMAGE_NAME}") +PODMAN_COMMAND_SERVE=("podman" "run" "--rm" "-it" "--device" "${CONTAINER_DEVICE}" \ + "--security-opt" "label=disable" "--net" "host" \ + "-v" "${WORKDIR}:/instructlab" \ + "--shm-size=10gb" \ + "-e" "HF_HOME=${CONTAINER_CACHE}/" \ + "${VLLM_NAME}" "--host=0.0.0.0" "--port=8080" "--tensor-parallel-size=${GPU_AMOUNT}") + +if [[ "$1" = "init" ]]; then + if ! 
has_argument "--repository" "$@"; then + shift + "${PODMAN_COMMAND[@]}" ilab init \ + --repository https://github.com/instructlab/taxonomy.git "$@" + exit $? + fi +elif [[ "$1" = "train" ]]; then + samples=$(get_argument_default "--num-samples" ${NUM_INSTRUCTIONS} "$@") + epochs=$(get_argument_default "--num-epochs" ${NUM_EPOCHS} "$@") + ${SCRIPT_DIR}/ilab-training-launcher ${NPROC_PER_NODE} ${EFFECTIVE_BATCH_SIZE} \ + ${TRAIN_DEVICE} ${samples} ${epochs} ${CONTAINER_DEVICE} ${TRAIN_NAME} + exit $? +elif [[ "$1" = "serve" ]]; then + # run vllm container which will serve vllm and ilab generate + args=() + model=$(get_model "$@") + if [[ "${model}" == *"${DEFAULT_SERVE_MODEL}" ]]; then + args+=("--chat-template=mixtral.jinja") + fi + args+=("--model" "${model}") + "${PODMAN_COMMAND_SERVE[@]}" "${args[@]}" + exit $? +elif [[ "$1" = "chat" ]]; then + shift + args=($@) + if ! has_argument "--endpoint-url" "$@"; then + args+=("--endpoint-url" "http://0.0.0.0:8080/v1") + fi + if ! has_argument "--model-family" "$@"; then + args+=("--model-family" "mixtral") + fi + args+=("--model" $(get_model "$@")) + "${PODMAN_COMMAND[@]}" ilab chat "${args[@]}" + exit $? +elif [[ "$1" = "generate" ]]; then + shift + args=($@) + if ! has_argument "--endpoint-url" "$@"; then + args+=("--endpoint-url" "http://0.0.0.0:8080/v1") + fi + if ! has_argument "--model-family" "$@"; then + args+=("--model-family" "mixtral") + fi + if ! has_argument "--num-instructions" "$@"; then + args+=("--num-instructions" "5000") + fi + args+=("--model" $(get_model "$@")) + echo ilab generate "${args[@]}" + + "${PODMAN_COMMAND[@]}" ilab generate "${args[@]}" + exit $? +elif [[ "$1" == "download" && $# -lt 2 ]]; then + echo "You must specify the model to download." 
+ echo + echo "High-fidelity generation and training requires two models:" + echo + echo "Mixtral: ilab download --repository ${DEFAULT_SERVE_MODEL}" + echo "Granite: ilab download --repository ibm/granite-7b-base" + echo + echo "For more options type ilab --help" + exit 1 +fi + +"${PODMAN_COMMAND[@]}" ilab "$@" + diff --git a/training/ilab-wrapper/ilab-qlora b/training/ilab-wrapper/ilab-qlora new file mode 100755 index 0000000..5670c11 --- /dev/null +++ b/training/ilab-wrapper/ilab-qlora @@ -0,0 +1,47 @@ +#!/bin/bash + +# Template values replaced by container build +TRAIN_DEVICE="__REPLACE_TRAIN_DEVICE__" +CONTAINER_DEVICE="__REPLACE_CONTAINER_DEVICE__" +CONTAINER_NAME="__REPLACE_CONTAINER_NAME__" + +# HF caching uses relative symlink structures, so keep cache relative to +# the central working directory +CONTAINER_CACHE="/instructlab/cache" +HOST_CACHE="$(pwd)/cache" +WORKDIR="$(pwd)" + +has_argument() { + match=$1 + shift + for arg in "$@"; do + if [[ "$arg" == *"$match"* ]]; then + return 0 + fi + done + return 1 +} + +mkdir -p "${HOST_CACHE}" +PODMAN_COMMAND=("podman" "run" "--rm" "-it" "--device" "${CONTAINER_DEVICE}" \ + "--security-opt" "label=disable" "--net" "host" \ + "-v" "${WORKDIR}:/instructlab" "--entrypoint" "" \ + "-e" "HF_HOME=${CONTAINER_CACHE}" \ + "${CONTAINER_NAME}") +if [[ "$1" = "init" ]]; then + if ! has_argument "--repository" "$@"; then + shift + "${PODMAN_COMMAND[@]}" ilab init \ + --repository https://github.com/instructlab/taxonomy.git "$@" + exit $? + fi +elif [[ "$1" = "train" ]]; then + if ! has_argument "--device" "$@"; then + shift + "${PODMAN_COMMAND[@]}" ilab train --device ${TRAIN_DEVICE} "$@" + exit $? 
+ fi +fi + +"${PODMAN_COMMAND[@]}" ilab "$@" + diff --git a/training/ilab-wrapper/ilab-training-launcher b/training/ilab-wrapper/ilab-training-launcher new file mode 100755 index 0000000..2c14262 --- /dev/null +++ b/training/ilab-wrapper/ilab-training-launcher @@ -0,0 +1,86 @@ +#!/bin/bash + +if [[ $# -lt 6 ]]; then + echo "error: this is an internal command and not intended for direct execution, instead use ilab" + exit 1 +fi + +NPROC_PER_NODE="$1" +EFFECTIVE_BATCH_SIZE="$2" +TRAIN_DEVICE="$3" +SAMPLE_SIZE="$4" +NUM_EPOCHS="$5" +CONTAINER_DEVICE="$6" +CONTAINER_NAME="$7" +SDG_OUTPUT_PATH="$(pwd)" + +SAVE_SAMPLES=$(($SAMPLE_SIZE - 1)) +TESTING_DATA_PATH="/instructlab/generated" +TRAINING_DATA_PATH="/instructlab/generated" +DATASET_NAME="ilab-generated" +CONTAINER_CACHE="/instructlab/cache" +WORKDIR="$(pwd)" + +PODMAN_COMMAND=("podman" "run" "--device" "${CONTAINER_DEVICE}" \ + "--security-opt" "label=disable" \ + "--entrypoint" "" \ + "-v" "${SDG_OUTPUT_PATH}":/instructlab \ + "${CONTAINER_NAME}") +# Convert ilab generate output to match SDG output format for train and test data +mkdir -p ${SDG_OUTPUT_PATH}/training +"${PODMAN_COMMAND[@]}" bash -c "python /training/ilab_to_sdg.py \"${TRAINING_DATA_PATH}\" train \"${DATASET_NAME}\"; mv sdg_out.jsonl /instructlab/training/train.jsonl" +"${PODMAN_COMMAND[@]}" bash -c "python /training/ilab_to_sdg.py \"${TESTING_DATA_PATH}\" test \"${DATASET_NAME}\"; mv sdg_out.jsonl /instructlab/training/test.jsonl" + +# Add curated subset of taxonomy +"${PODMAN_COMMAND[@]}" bash -c "cat /training/sample-data/train_all_pruned_SDG.jsonl >> /instructlab/training/train.jsonl" + +# Pre-process generated data before training +"${PODMAN_COMMAND[@]}" bash -c \ +"python data_process.py --logging_level INFO \ +--data_path /instructlab/training/train.jsonl \ +--data_output_path=/instructlab/training \ +--max_seq_len 4096 \ +--model_name_or_path /instructlab/models/ibm/granite-7b-base" + +PODMAN_COMMAND=("podman" "run" "--rm" "-it" "--device" 
"${CONTAINER_DEVICE}" \ + "--shm-size=10g" "--security-opt" "label=disable" "--net" "host" \ + "-v" "${WORKDIR}:/instructlab" "--entrypoint" "" \ + "-e" "HF_HOME=${CONTAINER_CACHE}" \ + "${CONTAINER_NAME}") +mkdir -p training_output +# Run training +"${PODMAN_COMMAND[@]}" \ +torchrun \ +--nnodes 1 \ +--node_rank 0 \ +--nproc_per_node ${NPROC_PER_NODE} \ +--rdzv_id 101 \ +--rdzv_endpoint 0.0.0.0:8888 /training/main_ds.py \ +--model_name_or_path /instructlab/models/ibm/granite-7b-base \ +--data_path /instructlab/training/data.jsonl \ +--output_dir="/instructlab/training_output" \ +--num_epochs=${NUM_EPOCHS} \ +--effective_batch_size=${EFFECTIVE_BATCH_SIZE} \ +--learning_rate=2e-5 \ +--num_warmup_steps=385 \ +--save_samples=${SAVE_SAMPLES} \ +--log_level="INFO" \ +--sharding_strategy='HYBRID_SHARD' \ +--seed=19347 | tee training_output/0.log + +echo +echo + +if [[ -d "${SDG_OUTPUT_PATH}/training_output/hf_format" ]]; then + month=$(date +'%m') + day=$(date +'%d') + hour=$(date +'%H') + min=$(date +'%M') + + dest=${SDG_OUTPUT_PATH}/models/tuned-${month}${day}-${hour}${min} + mv training_output/hf_format "${dest}" + echo "Generated model in ${dest}:" + (cd ${dest}; find . -type d) +else + echo "Warning: No results were written!" 
+fi diff --git a/training/instructlab/Makefile b/training/instructlab/Makefile new file mode 100644 index 0000000..a33d276 --- /dev/null +++ b/training/instructlab/Makefile @@ -0,0 +1,24 @@ +default: instructlab + +REGISTRY ?= quay.io +REGISTRY_ORG ?= ai-lab +IMAGE_TAG ?= latest + +INSTRUCTLAB_GIT_REPO ?= https://github.com/instructlab/instructlab.git +INSTRUCTLAB_GIT_BRANCH ?= main + +.PHONY: instructlab +instructlab: + @mkdir -p ../build + git clone $(INSTRUCTLAB_GIT_REPO) instructlab 2> /dev/null || true + (cd instructlab; git pull origin $(INSTRUCTLAB_GIT_BRANCH)) + +.PHONY: nvidia +nvidia: instructlab + rm -rf ../build/instructlab-$@ + podman build --layers=false --squash-all -t oci:../build/instructlab-$@ instructlab/containers/cuda + +.PHONY: amd +amd: instructlab + rm -rf ../build/instructlab-$@ + podman build --layers=false --squash-all -t oci:../build/instructlab-$@ -f instructlab/containers/rocm/Containerfile instructlab diff --git a/training/intel-bootc/Containerfile b/training/intel-bootc/Containerfile new file mode 100644 index 0000000..93d37d3 --- /dev/null +++ b/training/intel-bootc/Containerfile @@ -0,0 +1,83 @@ +ARG BASEIMAGE="quay.io/centos-bootc/centos-bootc:stream9" + +FROM ${BASEIMAGE} as builder + +ARG OS_VERSION_MAJOR='' +ARG DRIVER_VERSION=1.15.1-15 +ARG TARGET_ARCH='' +ARG KERNEL_VERSION='' + +RUN if [ "${OS_VERSION_MAJOR}" == "" ]; then \ + . /etc/os-release \ + && export OS_VERSION_MAJOR="$(echo ${VERSION} | cut -d'.' 
-f 1)" ;\ + fi \ + && if [ "${TARGET_ARCH}" == "" ]; then \ + export TARGET_ARCH=$(arch) ;\ + fi \ + && if [ "${KERNEL_VERSION}" == "" ]; then \ + RELEASE=$(dnf info --installed kernel-core | grep Release | awk -F: '{print $2}' | tr -d '[:blank:]') \ + && VERSION=$(dnf info --installed kernel-core | grep Version | awk -F: '{print $2}' | tr -d '[:blank:]') \ + && export KERNEL_VERSION="${VERSION}-${RELEASE}" ;\ + fi \ + && dnf install -y make git kernel-devel-${KERNEL_VERSION} \ + && mkdir /tmp/habanalabs \ + && cd /tmp/habanalabs \ + && curl -LO https://vault.habana.ai/artifactory/rhel/${OS_VERSION_MAJOR}/9.2/habanalabs-${DRIVER_VERSION}.el${OS_VERSION_MAJOR}.noarch.rpm \ + && curl -LO https://vault.habana.ai/artifactory/rhel/${OS_VERSION_MAJOR}/9.2/habanalabs-firmware-${DRIVER_VERSION}.el${OS_VERSION_MAJOR}.${TARGET_ARCH}.rpm \ + && curl -LO https://vault.habana.ai/artifactory/rhel/${OS_VERSION_MAJOR}/9.2/habanalabs-rdma-core-${DRIVER_VERSION}.el${OS_VERSION_MAJOR}.noarch.rpm \ + && curl -LO https://vault.habana.ai/artifactory/rhel/${OS_VERSION_MAJOR}/9.2/habanalabs-thunk-${DRIVER_VERSION}.el${OS_VERSION_MAJOR}.${TARGET_ARCH}.rpm \ + && curl -LO https://vault.habana.ai/artifactory/rhel/${OS_VERSION_MAJOR}/9.2/habanalabs-firmware-tools-${DRIVER_VERSION}.el${OS_VERSION_MAJOR}.${TARGET_ARCH}.rpm \ + && rpm2cpio habanalabs-${DRIVER_VERSION}.el${OS_VERSION_MAJOR}.noarch.rpm | cpio -id \ + && rpm2cpio habanalabs-firmware-${DRIVER_VERSION}.el${OS_VERSION_MAJOR}.${TARGET_ARCH}.rpm | cpio -id \ + && rpm2cpio habanalabs-firmware-tools-${DRIVER_VERSION}.el${OS_VERSION_MAJOR}.${TARGET_ARCH}.rpm | cpio -id \ + && cd ./usr/src/habanalabs-${DRIVER_VERSION} \ + && make -j$(nproc) KVERSION="${KERNEL_VERSION}.${TARGET_ARCH}" GIT_SHA=$(cat dkms.conf | grep "KMD_LAST_GIT_SHA=" | cut -d "=" -f 2) \ + && make -j$(nproc) KVERSION="${KERNEL_VERSION}.${TARGET_ARCH}" GIT_SHA=$(cat dkms.conf | grep "KMD_LAST_GIT_SHA=" | cut -d "=" -f 2) -f Makefile.nic \ + && xz 
drivers/accel/habanalabs/habanalabs.ko \ + && xz drivers/net/ethernet/intel/hl_cn/habanalabs_cn.ko \ + && xz drivers/net/ethernet/intel/hl_en/habanalabs_en.ko \ + && cd drivers/infiniband/hw/hlib && make KVERSION="${KERNEL_VERSION}.${TARGET_ARCH}" \ + && xz habanalabs_ib.ko \ + && cp /tmp/habanalabs/usr/src/habanalabs-${DRIVER_VERSION}/drivers/accel/habanalabs/habanalabs.ko.xz /tmp/ \ + && cp /tmp/habanalabs/usr/src/habanalabs-${DRIVER_VERSION}/drivers/infiniband/hw/hlib/habanalabs_ib.ko.xz /tmp/ \ + && cp /tmp/habanalabs/usr/src/habanalabs-${DRIVER_VERSION}/drivers/net/ethernet/intel/hl_cn/habanalabs_cn.ko.xz /tmp/ \ + && cp /tmp/habanalabs/usr/src/habanalabs-${DRIVER_VERSION}/drivers/net/ethernet/intel/hl_en/habanalabs_en.ko.xz /tmp/ \ + && cp /tmp/habanalabs/usr/bin/hl-smi /tmp/ + +FROM ${BASEIMAGE} + +ARG KERNEL_VERSION='' +ARG EXTRA_RPM_PACKAGES='' + +COPY --from=builder --chown=0:0 /tmp/habanalabs/lib/firmware/habanalabs/gaudi /lib/firmware/habanalabs/gaudi +COPY --from=builder --chown=0:0 /tmp/habanalabs/lib/firmware/habanalabs/gaudi2 /lib/firmware/habanalabs/gaudi2 +COPY --from=builder --chown=0:0 /tmp/habanalabs/lib/firmware/habanalabs/gaudi3 /lib/firmware/habanalabs/gaudi3 +COPY --from=builder --chown=0:0 /tmp/habanalabs*.xz /tmp/ +COPY --from=builder --chown=0:0 /tmp/hl-smi /usr/bin/ + +# Include growfs service +COPY build/usr /usr + +ARG INSTRUCTLAB_IMAGE +ARG VLLM_IMAGE + +RUN if [ "${KERNEL_VERSION}" == "" ]; then \ + RELEASE=$(dnf info --installed kernel-core | grep Release | awk -F: '{print $2}' | tr -d '[:blank:]') \ + && VERSION=$(dnf info --installed kernel-core | grep Version | awk -F: '{print $2}' | tr -d '[:blank:]') \ + && export KERNEL_VERSION="${VERSION}-${RELEASE}" ;\ + fi \ + && if [ "${TARGET_ARCH}" == "" ]; then \ + export TARGET_ARCH=$(arch) ;\ + fi \ + && mkdir -p /lib/modules/${KERNEL_VERSION}.${TARGET_ARCH}/extra/ \ + && mv /tmp/*.xz /lib/modules/${KERNEL_VERSION}.${TARGET_ARCH}/extra/ \ + && chown root:root 
/lib/modules/${KERNEL_VERSION}.${TARGET_ARCH}/extra/habanalabs*.xz \ + && depmod -a ${KERNEL_VERSION}.${TARGET_ARCH} \ + && xargs --no-run-if-empty dnf install -y <<< "${EXTRA_RPM_PACKAGES}" \ + && dnf clean all + +# Prepull the instructlab image +RUN IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/vllm) && \ + podman --root /usr/lib/containers/storage image tag ${IID} ${VLLM_IMAGE} +#RUN IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/instructlab-intel) && \ +# podman --root /usr/lib/containers/storage image tag ${IID} ${INSTRUCTLAB_IMAGE} diff --git a/training/intel-bootc/Makefile b/training/intel-bootc/Makefile new file mode 100644 index 0000000..b0bca85 --- /dev/null +++ b/training/intel-bootc/Makefile @@ -0,0 +1,22 @@ +IMAGE_NAME ?= intel-bootc + +include ../common/Makefile.common + +default: bootc + +.PHONY: bootc +bootc: growfs + "${CONTAINER_TOOL}" build \ + $(ARCH:%=--platform linux/%) \ + --security-opt label=disable \ + --cap-add SYS_ADMIN \ + --file Containerfile \ + --tag "${BOOTC_IMAGE}" \ + -v ${OUTDIR}:/run/.input:ro \ + $(EXTRA_RPM_PACKAGES:%=--build-arg EXTRA_RPM_PACKAGES=%) \ + $(FROM:%=--build-arg BASEIMAGE=%) \ + $(DRIVER_VERSION:%=--build-arg DRIVER_VERSION=%) \ + $(KERNEL_VERSION:%=--build-arg KERNEL_VERSION=%) \ + --build-arg "INSTRUCTLAB_IMAGE=$(INSTRUCTLAB_IMAGE)" \ + --build-arg "VLLM_IMAGE=$(VLLM_IMAGE)" \ + ${CONTAINER_TOOL_EXTRA_ARGS} . 
diff --git a/training/model/Containerfile b/training/model/Containerfile new file mode 100644 index 0000000..e115374 --- /dev/null +++ b/training/model/Containerfile @@ -0,0 +1,11 @@ +FROM registry.access.redhat.com/ubi9/ubi + +ARG MODEL_REPO='' +ARG MODEL_NAME='' +ARG MODEL_PATH='' + +RUN dnf install -y python3-pip && python3 -m pip install huggingface_hub +RUN mkdir -p "${MODEL_PATH}" \ + && echo from huggingface_hub import snapshot_download > /root/hf_download \ + && echo snapshot_download\(repo_id=\'${MODEL_REPO}\', local_dir=\'${MODEL_PATH}\', local_dir_use_symlinks=False\) >> /root/hf_download \ + && python3 /root/hf_download diff --git a/training/model/Makefile b/training/model/Makefile new file mode 100644 index 0000000..de2d0ba --- /dev/null +++ b/training/model/Makefile @@ -0,0 +1,22 @@ +FROM ?= + +REGISTRY ?= quay.io +REGISTRY_ORG ?= ai-lab +IMAGE_NAME ?= granite-7b-lab +IMAGE_TAG ?= latest + +CONTAINER_TOOL ?= podman +CONTAINER_TOOL_EXTRA_ARGS ?= + +MODEL_REPO ?= ibm/granite-7b-base +MODEL_PATH ?= /usr/share/ai-model + +.PHONY: image +image: + "${CONTAINER_TOOL}" build \ + --file Containerfile \ + --tag "${REGISTRY}/${REGISTRY_ORG}/${IMAGE_NAME}:${IMAGE_TAG}" \ + $(FROM:%=--build-arg BASEIMAGE=%) \ + $(MODEL_REPO:%=--build-arg MODEL_REPO=%) \ + $(MODEL_PATH:%=--build-arg MODEL_PATH=%) \ + ${CONTAINER_TOOL_EXTRA_ARGS} diff --git a/training/nvidia-bootc/Containerfile b/training/nvidia-bootc/Containerfile new file mode 100644 index 0000000..008f0fb --- /dev/null +++ b/training/nvidia-bootc/Containerfile @@ -0,0 +1,183 @@ +ARG DRIVER_TOOLKIT_IMAGE="quay.io/ai-lab/nvidia-builder:latest" +ARG BASEIMAGE="quay.io/centos-bootc/centos-bootc:stream9" + +FROM ${DRIVER_TOOLKIT_IMAGE} as builder + +ARG BASE_URL='https://us.download.nvidia.com/tesla' + +ARG OS_VERSION_MAJOR='' +ARG KERNEL_VERSION='' + +ARG BUILD_ARCH='' +ARG TARGET_ARCH='' + +ARG DRIVER_VERSION='550.54.15' + +USER builder + +WORKDIR /home/builder +COPY --chown=1001:0 x509-configuration.ini 
x509-configuration.ini + +# Include growfs service +COPY build/usr /usr + +RUN if [ "${KERNEL_VERSION}" == "" ]; then \ + RELEASE=$(dnf info kernel-core | grep Release | awk -F: '{print $2}' | tr -d '[:blank:]') \ + && VERSION=$(dnf info kernel-core | grep Version | awk -F: '{print $2}' | tr -d '[:blank:]') \ + && export KERNEL_VERSION="${VERSION}-${RELEASE}" ;\ + fi \ + && if [ "${OS_VERSION_MAJOR}" == "" ]; then \ + . /etc/os-release \ + && export OS_ID="$(echo ${ID})" \ + && export OS_VERSION_MAJOR="$(echo ${VERSION} | cut -d'.' -f 1)" ;\ + fi \ + && if [ "${BUILD_ARCH}" == "" ]; then \ + export BUILD_ARCH=$(arch) \ + && export TARGET_ARCH=$(echo "${BUILD_ARCH}" | sed 's/+64k//') ;\ + fi \ + && export KVER=$(echo ${KERNEL_VERSION} | cut -d '-' -f 1) \ + && KREL=$(echo ${KERNEL_VERSION} | cut -d '-' -f 2 | sed 's/\.el._*.*\..\+$//' | cut -d'.' -f 1) \ + && if [ "${OS_ID}" == "rhel" ]; then \ + KDIST="."$(echo ${KERNEL_VERSION} | cut -d '-' -f 2 | cut -d '.' -f 2-) ;\ + else \ + KDIST="."$(echo ${KERNEL_VERSION} | cut -d '-' -f 2 | sed 's/^.*\(\.el._*.*\)\..\+$/\1/' | cut -d'.' -f 2) ;\ + fi \ + && DRIVER_STREAM=$(echo ${DRIVER_VERSION} | cut -d '.' 
-f 1) \ + && git clone --depth 1 --single-branch -b rhel${OS_VERSION_MAJOR} https://github.com/NVIDIA/yum-packaging-precompiled-kmod \ + && cd yum-packaging-precompiled-kmod \ + && mkdir BUILD BUILDROOT RPMS SRPMS SOURCES SPECS \ + && mkdir nvidia-kmod-${DRIVER_VERSION}-${BUILD_ARCH} \ + && curl -sLOf ${BASE_URL}/${DRIVER_VERSION}/NVIDIA-Linux-${TARGET_ARCH}-${DRIVER_VERSION}.run \ + && sh ./NVIDIA-Linux-${TARGET_ARCH}-${DRIVER_VERSION}.run --extract-only --target tmp \ + && mv tmp/kernel-open nvidia-kmod-${DRIVER_VERSION}-${BUILD_ARCH}/kernel \ + && tar -cJf SOURCES/nvidia-kmod-${DRIVER_VERSION}-${BUILD_ARCH}.tar.xz nvidia-kmod-${DRIVER_VERSION}-${BUILD_ARCH} \ + && mv kmod-nvidia.spec SPECS/ \ + && openssl req -x509 -new -nodes -utf8 -sha256 -days 36500 -batch \ + -config ${HOME}/x509-configuration.ini \ + -outform DER -out SOURCES/public_key.der \ + -keyout SOURCES/private_key.priv \ + && rpmbuild \ + --define "% _arch ${BUILD_ARCH}" \ + --define "%_topdir $(pwd)" \ + --define "debug_package %{nil}" \ + --define "kernel ${KVER}" \ + --define "kernel_release ${KREL}" \ + --define "kernel_dist ${KDIST}" \ + --define "driver ${DRIVER_VERSION}" \ + --define "driver_branch ${DRIVER_STREAM}" \ + -v -bb SPECS/kmod-nvidia.spec + +FROM ${BASEIMAGE} + +ARG BASE_URL='https://us.download.nvidia.com/tesla' + +ARG OS_VERSION_MAJOR='' +ARG KERNEL_VERSION='' + + +ARG DRIVER_TYPE=passthrough +ENV NVIDIA_DRIVER_TYPE=${DRIVER_TYPE} + +ARG DRIVER_VERSION='550.54.15' +ENV NVIDIA_DRIVER_VERSION=${DRIVER_VERSION} +ARG CUDA_VERSION='12.3.2' + +ARG TARGET_ARCH='' +ENV TARGETARCH=${TARGET_ARCH} + +ARG EXTRA_RPM_PACKAGES='' + +# Disable vGPU version compatibility check by default +ARG DISABLE_VGPU_VERSION_CHECK=true +ENV DISABLE_VGPU_VERSION_CHECK=$DISABLE_VGPU_VERSION_CHECK + +USER root + +COPY --from=builder /home/builder/yum-packaging-precompiled-kmod/RPMS/*/*.rpm /rpms/ +COPY --from=builder --chmod=444 /home/builder/yum-packaging-precompiled-kmod/tmp/firmware/*.bin 
/lib/firmware/nvidia/${DRIVER_VERSION}/ + +RUN dnf install -y /rpms/kmod-nvidia-*.rpm + +COPY nvidia-toolkit-firstboot.service /usr/lib/systemd/system/nvidia-toolkit-firstboot.service + +RUN if [ "${TARGET_ARCH}" == "" ]; then \ + export TARGET_ARCH="$(arch)" ;\ + fi \ + && if [ "${OS_VERSION_MAJOR}" == "" ]; then \ + . /etc/os-release \ + && export OS_VERSION_MAJOR="$(echo ${VERSION} | cut -d'.' -f 1)" ;\ + fi \ + && export DRIVER_STREAM=$(echo ${DRIVER_VERSION} | cut -d '.' -f 1) \ + CUDA_VERSION_ARRAY=(${CUDA_VERSION//./ }) \ + CUDA_DASHED_VERSION=${CUDA_VERSION_ARRAY[0]}-${CUDA_VERSION_ARRAY[1]} \ + CUDA_REPO_ARCH=${TARGET_ARCH} \ + && if [ "${TARGET_ARCH}" == "aarch64" ]; then CUDA_REPO_ARCH="sbsa"; fi \ + && dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save \ + && dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel${OS_VERSION_MAJOR}/${CUDA_REPO_ARCH}/cuda-rhel${OS_VERSION_MAJOR}.repo \ + && dnf -y module enable nvidia-driver:${DRIVER_STREAM}/default \ + && dnf install -y \ + nvidia-driver-cuda-${DRIVER_VERSION} \ + nvidia-driver-libs-${DRIVER_VERSION} \ + nvidia-driver-NVML-${DRIVER_VERSION} \ + cuda-compat-${CUDA_DASHED_VERSION} \ + cuda-cudart-${CUDA_DASHED_VERSION} \ + nvidia-persistenced-${DRIVER_VERSION} \ + nvidia-container-toolkit \ + ${EXTRA_RPM_PACKAGES} \ + && if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$TARGET_ARCH" != "arm64" ]; then \ + versionArray=(${DRIVER_VERSION//./ }); \ + DRIVER_BRANCH=${versionArray[0]}; \ + dnf module enable -y nvidia-driver:${DRIVER_BRANCH} && \ + dnf install -y nvidia-fabric-manager-${DRIVER_VERSION} libnvidia-nscq-${DRIVER_BRANCH}-${DRIVER_VERSION} ; \ + fi \ + && dnf clean all \ + && ln -s /usr/lib/systemd/system/nvidia-toolkit-firstboot.service /usr/lib/systemd/system/basic.target.wants/nvidia-toolkit-firstboot.service \ + && echo "blacklist nouveau" > /etc/modprobe.d/blacklist_nouveau.conf + + +ARG SSHPUBKEY + +# The --build-arg "SSHPUBKEY=$(cat 
~/.ssh/id_rsa.pub)" option inserts your +# public key into the image, allowing root access via ssh. +RUN set -eu; mkdir -p /usr/ssh && \ + echo 'AuthorizedKeysFile /usr/ssh/%u.keys .ssh/authorized_keys .ssh/authorized_keys2' >> /etc/ssh/sshd_config.d/30-auth-system.conf && \ + echo ${SSHPUBKEY} > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys + +# Setup /usr/lib/containers/storage as an additional store for images. +# Remove once the base images have this set by default. +# Also make sure not to duplicate if a base image already has it specified. +RUN grep -q /usr/lib/containers/storage /etc/containers/storage.conf || \ + sed -i -e '/additionalimage.*/a "/usr/lib/containers/storage",' \ + /etc/containers/storage.conf && \ + cp /run/.input/ilab* /usr/local/bin/ + + +ARG INSTRUCTLAB_IMAGE +ARG INSTRUCTLAB_IMAGE_ID +ARG VLLM_IMAGE +ARG VLLM_IMAGE_ID +ARG TRAIN_IMAGE +ARG TRAIN_IMAGE_ID +ARG GPU_COUNT_COMMAND="nvidia-ctk --quiet cdi list | grep -P nvidia.com/gpu='\\\\d+' | wc -l" + +RUN for i in /usr/local/bin/ilab*; do \ + sed -i 's/__REPLACE_TRAIN_DEVICE__/cuda/' $i; \ + sed -i 's/__REPLACE_CONTAINER_DEVICE__/nvidia.com\/gpu=all/' $i; \ + sed -i "s%__REPLACE_IMAGE_NAME__%${INSTRUCTLAB_IMAGE}%" $i; \ + sed -i "s%__REPLACE_VLLM_NAME__%${VLLM_IMAGE}%" $i; \ + sed -i "s%__REPLACE_TRAIN_NAME__%${TRAIN_IMAGE}%" $i; \ + sed -i 's%__REPLACE_ENDPOINT_URL__%http://0.0.0.0:8080/v1%' $i; \ + sed -i "s%__REPLACE_GPU_COUNT_COMMAND__%${GPU_COUNT_COMMAND}%" $i; \ + sed -i 's/__REPLACE_TRAIN_DEVICE__/cuda/' $i; \ + done + +# Added for running as an OCI Container to prevent Overlay on Overlay issues. 
+VOLUME /var/lib/containers + +RUN IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/vllm) && \ + podman --root /usr/lib/containers/storage image tag ${IID} ${VLLM_IMAGE} +RUN IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/instructlab-nvidia) && \ + podman --root /usr/lib/containers/storage image tag ${IID} ${INSTRUCTLAB_IMAGE} +RUN IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/deepspeed-trainer) && \ + podman --root /usr/lib/containers/storage image tag ${IID} ${TRAIN_IMAGE} +RUN podman system reset --force 2>/dev/null diff --git a/training/nvidia-bootc/Containerfile.builder b/training/nvidia-bootc/Containerfile.builder new file mode 100644 index 0000000..f7eab57 --- /dev/null +++ b/training/nvidia-bootc/Containerfile.builder @@ -0,0 +1,56 @@ +FROM quay.io/centos/centos:stream9 + +ARG KERNEL_VERSION='' +ARG ENABLE_RT='' + +USER root + +RUN if [ "${KERNEL_VERSION}" == "" ]; then \ + RELEASE=$(dnf info kernel-core | grep Release | awk -F: '{print $2}' | tr -d '[:blank:]') \ + && VERSION=$(dnf info kernel-core | grep Version | awk -F: '{print $2}' | tr -d '[:blank:]') \ + && export KERNEL_VERSION="${VERSION}-${RELEASE}" ;\ + fi \ + && echo "${KERNEL_VERSION}" \ + && dnf -y install dnf-plugin-config-manager \ + && dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save \ + && dnf -y install \ + kernel-devel-${KERNEL_VERSION} \ + kernel-modules-${KERNEL_VERSION} \ + kernel-modules-extra-${KERNEL_VERSION} \ + && if [ "${ENABLE_RT}" ] && [ $(arch) == "x86_64" ]; then \ + dnf -y --enablerepo=rt install \ + kernel-rt-devel-${KERNEL_VERSION} \ + kernel-rt-modules-${KERNEL_VERSION} \ + kernel-rt-modules-extra-${KERNEL_VERSION}; \ + fi \ + && export INSTALLED_KERNEL=$(rpm -q --qf "%{VERSION}-%{RELEASE}.%{ARCH}" kernel-core-${KERNEL_VERSION}) \ + && export GCC_VERSION=$(cat /lib/modules/${INSTALLED_KERNEL}/config | grep -Eo "gcc \(GCC\) ([0-9\.]+)" | grep -Eo "([0-9\.]+)") \ + && dnf -y install 
\ + binutils \ + diffutils \ + elfutils-libelf-devel \ + jq \ + kabi-dw kernel-abi-stablelists \ + keyutils \ + kmod \ + gcc-${GCC_VERSION} \ + git \ + make \ + mokutil \ + openssl \ + pinentry \ + rpm-build \ + xz \ + && dnf clean all \ + && useradd -u 1001 -m -s /bin/bash builder + +# Last layer for metadata for mapping the driver-toolkit to a specific kernel version +RUN if [ "${KERNEL_VERSION}" == "" ]; then \ + export INSTALLED_KERNEL=$(rpm -q --qf "%{VERSION}-%{RELEASE}.%{ARCH}" kernel-core); \ + else \ + export INSTALLED_KERNEL=$(rpm -q --qf "%{VERSION}-%{RELEASE}.%{ARCH}" kernel-core-${KERNEL_VERSION}) ;\ + fi \ + && echo "{ \"KERNEL_VERSION\": \"${INSTALLED_KERNEL}\" }" > /etc/driver-toolkit-release.json \ + && echo -e "KERNEL_VERSION=\"${INSTALLED_KERNEL}\"" > /etc/driver-toolkit-release.sh + +USER builder diff --git a/training/nvidia-bootc/Makefile b/training/nvidia-bootc/Makefile new file mode 100644 index 0000000..8aceeb3 --- /dev/null +++ b/training/nvidia-bootc/Makefile @@ -0,0 +1,50 @@ +VENDOR ?= nvidia +IMAGE_NAME ?= $(VENDOR)-bootc +DTK_IMAGE_NAME ?= $(VENDOR)-builder +DTK_IMAGE_TAG ?= latest +DRIVER_TOOLKIT_IMAGE = "${REGISTRY}/${REGISTRY_ORG}/${DTK_IMAGE_NAME}:${DTK_IMAGE_TAG}" + +CUDA_VERSION ?= +OS_VERSION_MAJOR ?= +ENABLE_RT ?= + +include ../common/Makefile.common + +default: bootc + +.PHONY: dtk +dtk: + "${CONTAINER_TOOL}" build \ + $(ARCH:%=--platform linux/%) \ + $(KERNEL_VERSION:%=--build-arg KERNEL_VERSION=%) \ + $(ENABLE_RT:%=--build-arg ENABLE_RT=%) \ + --file Containerfile.builder \ + --tag "${DRIVER_TOOLKIT_IMAGE}" \ + $(FROM:%=--from=%) \ + ${CONTAINER_TOOL_EXTRA_ARGS} . 
+ +.PHONY: bootc +bootc: dtk check-sshkey prepare-files growfs + "${CONTAINER_TOOL}" build \ + --security-opt label=disable \ + --cap-add SYS_ADMIN \ + $(ARCH:%=--platform linux/%) \ + --file Containerfile \ + -v ${OUTDIR}:/run/.input:ro \ + --tag "${BOOTC_IMAGE}" \ + $(KERNEL_VERSION:%=--build-arg KERNEL_VERSION=%) \ + $(OS_VERSION_MAJOR:%=--build-arg OS_VERSION_MAJOR=%) \ + $(FROM:%=--build-arg BASEIMAGE=%) \ + $(EXTRA_RPM_PACKAGES:%=--build-arg EXTRA_RPM_PACKAGES=%) \ + --build-arg DRIVER_TOOLKIT_IMAGE=${DRIVER_TOOLKIT_IMAGE} \ + $(DRIVER_VERSION:%=--label driver-version=%) \ + $(DRIVER_VERSION:%=--build-arg DRIVER_VERSION=%) \ + $(CUDA_VERSION:%=--build-arg CUDA_VERSION=%) \ + --build-arg "INSTRUCTLAB_IMAGE=$(INSTRUCTLAB_IMAGE)" \ + --build-arg "INSTRUCTLAB_IMAGE_ID=$(INSTRUCTLAB_IMAGE_ID)" \ + --build-arg "VLLM_IMAGE=$(VLLM_IMAGE)" \ + --build-arg "VLLM_IMAGE_ID=$(VLLM_IMAGE_ID)" \ + --build-arg "TRAIN_IMAGE=$(TRAIN_IMAGE)" \ + --build-arg "TRAIN_IMAGE_ID=$(TRAIN_IMAGE_ID)" \ + --build-arg "SSHPUBKEY=$(SSH_PUBKEY)" \ + ${CONTAINER_TOOL_EXTRA_ARGS} . 
diff --git a/training/nvidia-bootc/nvidia-toolkit-firstboot.service b/training/nvidia-bootc/nvidia-toolkit-firstboot.service new file mode 100644 index 0000000..82a36e0 --- /dev/null +++ b/training/nvidia-bootc/nvidia-toolkit-firstboot.service @@ -0,0 +1,13 @@ +[Unit] +# For more information see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html +# It looks like the podman/CDI integration wants a pre-generated list of hardware +Description=Generate /etc/cdi/nvidia.yaml + +[Service] +Type=oneshot +ExecStart=nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml +RemainAfterExit=yes + +[Install] +# TODO: Ensure we have a target that is like "container setup" +WantedBy=multi-user.target diff --git a/training/nvidia-bootc/x509-configuration.ini b/training/nvidia-bootc/x509-configuration.ini new file mode 100644 index 0000000..60fee41 --- /dev/null +++ b/training/nvidia-bootc/x509-configuration.ini @@ -0,0 +1,15 @@ +[ req ] +default_bits = 4096 +distinguished_name = req_distinguished_name +prompt = no +string_mask = utf8only +x509_extensions = myexts +[ req_distinguished_name ] +O = Project Magma +CN = Project Magma +emailAddress = magma@acme.com +[ myexts ] +basicConstraints=critical,CA:FALSE +keyUsage=digitalSignature +subjectKeyIdentifier=hash +authorityKeyIdentifier=keyid diff --git a/training/provision/ansible.cfg b/training/provision/ansible.cfg new file mode 100644 index 0000000..00deea9 --- /dev/null +++ b/training/provision/ansible.cfg @@ -0,0 +1,2 @@ +[ssh_connection] +ssh_args = '-o StrictHostKeyChecking=no' diff --git a/training/provision/playbook.yml b/training/provision/playbook.yml new file mode 100644 index 0000000..d1723c4 --- /dev/null +++ b/training/provision/playbook.yml @@ -0,0 +1,77 @@ +--- +- name: Test Environment Provisioning + hosts: test_environments + remote_user: ec2-user + become: true + gather_facts: false + + tasks: + + - name: Wait until the instance is ready + ansible.builtin.wait_for_connection: + 
delay: 15 + timeout: 180 + + - name: Gather facts for first time + ansible.builtin.setup: + + - name: Required packages + ansible.builtin.dnf: + name: + - https://s3.eu-west-2.amazonaws.com/amazon-ssm-eu-west-2/latest/linux_amd64/amazon-ssm-agent.rpm + - podman + state: present + disable_gpg_check: true + + - name: Derived Image Containerfile + ansible.builtin.template: + src: ./templates/Containerfile.j2 + dest: /tmp/Containerfile + + - name: Login to default registry + containers.podman.podman_login: + username: "{{ registry_user }}" + password: "{{ registry_password }}" + registry: quay.io + authfile: /etc/containers/auth.json + + - name: Build the Bootc Image + async: 1000 + poll: 0 + register: build_result + ansible.builtin.shell: | + podman build -t quay.io/ai-lab/derived_image:latest -f /tmp/Containerfile --authfile=/etc/containers/auth.json . + podman push quay.io/ai-lab/derived_image:latest --authfile=/etc/containers/auth.json + + - name: Check on Build Bootc Image + async_status: + jid: "{{ build_result.ansible_job_id }}" + register: job_result + until: job_result.finished + retries: 100 + delay: 10 + + - name: Install the Bootc Image + async: 1000 + poll: 0 + register: install_result + ansible.builtin.shell: | + podman run --authfile=/etc/containers/auth.json --rm --privileged --pid=host --security-opt label=type:unconfined_t -v /etc/containers/:/etc/containers -v /:/target -v /var/lib/containers:/var/lib/containers quay.io/ai-lab/derived_image:latest bootc install to-filesystem --karg=console=ttyS0,115200n8 --replace=alongside /target + + - name: Check on Install Bootc Image + async_status: + jid: "{{ install_result.ansible_job_id }}" + register: job_result + until: job_result.finished + retries: 100 + delay: 10 + + - name: Remove the host from the known_host file + ansible.builtin.known_hosts: + name: "{{ inventory_hostname }}" + state: absent + delegate_to: localhost + + - name: Reboot + ansible.builtin.shell: systemctl reboot + ignore_errors: true 
diff --git a/training/provision/requirements.yml b/training/provision/requirements.yml new file mode 100644 index 0000000..da8ae83 --- /dev/null +++ b/training/provision/requirements.yml @@ -0,0 +1,4 @@ +--- +collections: + - name: containers.podman + version: 1.13.0 diff --git a/training/provision/templates/Containerfile.j2 b/training/provision/templates/Containerfile.j2 new file mode 100644 index 0000000..061a0b9 --- /dev/null +++ b/training/provision/templates/Containerfile.j2 @@ -0,0 +1,9 @@ +FROM quay.io/ai-lab/{{ image_name }}:latest + +USER root + +RUN mkdir /usr/etc-system && \ + chown -R root:root /usr/etc-system && \ + echo 'AuthorizedKeysFile /usr/etc-system/root.keys' >> /etc/ssh/sshd_config.d/30-auth-system.conf && \ + echo {{ ssh_public_key }} > /usr/etc-system/root.keys && \ + chmod 0600 /usr/etc-system/root.keys diff --git a/training/vllm/Containerfile b/training/vllm/Containerfile new file mode 100644 index 0000000..4b165bc --- /dev/null +++ b/training/vllm/Containerfile @@ -0,0 +1,5 @@ +FROM quay.io/wxpe/tgis-vllm:release.4e3ff78 + +USER root +RUN ln -s /usr/lib64/libcuda.so.1 /usr/lib64/libcuda.so +COPY mixtral.jinja . \ No newline at end of file diff --git a/training/vllm/Makefile b/training/vllm/Makefile new file mode 100644 index 0000000..3668c61 --- /dev/null +++ b/training/vllm/Makefile @@ -0,0 +1,14 @@ +CONTAINER_TOOL ?= podman + +default: image + +.PHONY: image +image: + @mkdir -p ../build + rm -rf ../build/vllm + "${CONTAINER_TOOL}" build \ + $(ARCH:%=--platform linux/%) \ + --file Containerfile \ + --layers=false \ + --squash-all \ + --tag oci:../build/vllm . 
diff --git a/training/vllm/mixtral.jinja b/training/vllm/mixtral.jinja new file mode 100644 index 0000000..65209bc --- /dev/null +++ b/training/vllm/mixtral.jinja @@ -0,0 +1,12 @@ +{% set bos_token = "" %} + +{% set eos_token = "" %} + +{{ bos_token }} +{% for message in messages %} +{% if message['role'] == 'user' %} +{{ '[INST] ' + message['content'] + ' [/INST]' }} +{% elif message['role'] == 'assistant' %} +{{ message['content'] + eos_token}} +{% endif %} +{% endfor %}