From 1ea38471ad13b02791de9bf8b95206d2a33c734a Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 25 Apr 2024 14:56:44 -0700 Subject: [PATCH 01/17] adds TRT-LLM example (#717) --- 06_gpu_and_ml/llm-serving/trtllm_llama.py | 617 ++++++++++++++++++++++ 1 file changed, 617 insertions(+) create mode 100644 06_gpu_and_ml/llm-serving/trtllm_llama.py diff --git a/06_gpu_and_ml/llm-serving/trtllm_llama.py b/06_gpu_and_ml/llm-serving/trtllm_llama.py new file mode 100644 index 000000000..08bd46569 --- /dev/null +++ b/06_gpu_and_ml/llm-serving/trtllm_llama.py @@ -0,0 +1,617 @@ +# # Serverless TensorRT-LLM (LLaMA 3 8B) +# +# In this example, we demonstrate how to use the TensorRT-LLM framework to serve Meta's LLaMA 3 8B model +# at a total throughput of roughly 4,500 output tokens per second on a single NVIDIA A100 40GB GPU. +# At [Modal's on-demand rate](https://modal.com/pricing) of ~$4/hr, that's under $0.20 per million tokens -- +# on auto-scaling infrastructure and served via a customizable API. +# +# Additional optimizations like speculative sampling and FP8 quantization can further improve throughput. +# For more on the throughput levels that are possible with TensorRT-LLM for different combinations +# of model, hardware, and workload, see the +# [official benchmarks](https://github.com/NVIDIA/TensorRT-LLM/blob/71d8d4d3dc655671f32535d6d2b60cab87f36e87/docs/source/performance.md). +# +# ## Overview +# +# This guide is intended to document two things: +# the general process for building TensorRT-LLM on Modal +# and a specific configuration for serving the LLaMA 3 8B model. +# +# ### Build process +# +# Any given TensorRT-LLM service requires a multi-stage build process, +# starting from model weights and ending with a compiled engine. +# Because that process touches many sharp-edged high-performance components +# across the stack, it can easily go wrong in subtle and hard-to-debug ways +# that are idiosyncratic to specific systems. +# And debugging GPU workloads is expensive! +# +# This example builds an entire service from scratch, from downloading weight tensors +# to responding to requests, and so serves as living, interactive documentation of a TensorRT-LLM +# build process that works on Modal. +# +# ### Engine configuration +# +# TensorRT-LLM is the Lamborghini of inference engines: it achieves seriously +# impressive performance, but only if you tune it carefully. +# We carefully document the choices we made here and point to additional resources +# so you know where and how you might adjust the parameters for your use case. +# +# ## Installing TensorRT-LLM +# +# To run TensorRT-LLM, we must first install it. Easier said than done! +# +# In Modal, we define [container images](https://modal.com/docs/guide/custom-containers) that run our serverless workloads. +# All Modal containers have access to GPU drivers via the underlying host environment, +# but we still need to install the software stack on top of the drivers, from the CUDA runtime up. +# +# We start from the official `nvidia/cuda:12.1.1-devel-ubuntu22.04` image, +# which includes the CUDA runtime & development libraries +# and the environment configuration necessary to run them. + +import modal + +tensorrt_image = modal.Image.from_registry( + "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10" +) + +# On top of that, we add some system dependencies of TensorRT-LLM, +# including OpenMPI for distributed communication, some core software like `git`, +# and the `tensorrt_llm` package itself. 
+ +tensorrt_image = tensorrt_image.apt_install( + "openmpi-bin", "libopenmpi-dev", "git", "git-lfs", "wget" +).pip_install( + "tensorrt_llm==0.10.0.dev2024042300", + pre=True, + extra_index_url="https://pypi.nvidia.com", +) + +# Note that we're doing this by [method-chaining](https://quanticdev.com/articles/method-chaining/) +# a number of calls to methods on the `modal.Image`. If you're familiar with +# Dockerfiles, you can think of this as a Pythonic interface to instructions like `RUN` and `CMD`. +# +# End-to-end, this step takes five minutes. +# If you're reading this from top to bottom, +# you might want to stop here and execute the example +# with `modal run trtllm_llama.py` +# so that it runs in the background while you read the rest. +# +# ## Downloading the Model +# +# Next, we download the model we want to serve. In this case, we're using the instruction-tuned +# version of Meta's Llama 3 8B model. +# We use the function below to download the model from the Hugging Face Hub. + +MODEL_DIR = "/root/model/model_input" +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +MODEL_REVISION = "7840f95a8c7a781d3f89c4818bf693431ab3119a" # pin model revisions to prevent unexpected changes! + + +def download_model(): + import os + + from huggingface_hub import snapshot_download + from transformers.utils import move_cache + + os.makedirs(MODEL_DIR, exist_ok=True) + snapshot_download( + MODEL_ID, + local_dir=MODEL_DIR, + ignore_patterns=["*.pt", "*.bin"], # using safetensors + revision=MODEL_REVISION, + ) + move_cache() + + +# Just defining that function doesn't actually download the model, though. +# We can run it by adding it to the image's build process with `run_function`. +# The download process has its own dependencies, which we add here. + +MINUTES = 60 # seconds +tensorrt_image = ( # update the image by downloading the model we're using + tensorrt_image.pip_install( # add utilities for downloading the model + "hf-transfer==0.1.6", + "huggingface_hub==0.22.2", + "requests~=2.31.0", + ) + .env( # hf-transfer: faster downloads, but fewer comforts + {"HF_HUB_ENABLE_HF_TRANSFER": "1"} + ) + .run_function( # download the model + download_model, + timeout=20 * MINUTES, + secrets=[modal.Secret.from_name("huggingface-secret")], + ) +) + +# ## Configuring the model +# +# Now that we have the model downloaded, we need to convert it to a format that TensorRT-LLM can use. +# We use a convenience script provided by the TensorRT-LLM team. +# This script takes a few minutes to run. + +GIT_HASH = "71d8d4d3dc655671f32535d6d2b60cab87f36e87" +CHECKPOINT_SCRIPT_URL = f"https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/{GIT_HASH}/examples/llama/convert_checkpoint.py" + +# TensorRT-LLM requires that a GPU be present to load the model, even though it isn't used directly during this conversion process. +# We'll use a single A100-40GB GPU for this example, but we have also tested it successfully with A10G, A100-80GB, and H100 GPUs. +# +# The most important feature to track when selecting hardware to run on is GPU RAM: +# larger models, longer sequences, and bigger batches all require more memory, +# We tuned all three to maximize throughput on this example. +# +# The amount of GPU RAM on a single card is a tight constraint for most LLMs: +# RAM is measured in tens of gigabytes and +# models have billions of floating point parameters, +# each consuming one to four bytes of memory. 
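+# As a rough back-of-the-envelope check (our arithmetic, not a measured figure):
+# the 8B parameters of this model, at two bytes each in 16-bit precision,
+# come to roughly 8e9 * 2 bytes = ~16 GB of weights,
+# which is why it fits on a single 40 GB A100 with room to spare
+# for activations and the key-value cache.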
+# The performance cliff if you need to spill to CPU memory is steep, +# so the only solution is to split the model across multiple GPUs. +# This is particularly important when serving larger models (e.g. 70B or 8x22B). + +N_GPUS = 1 # Heads up: this example has not yet been tested with multiple GPUs +GPU_CONFIG = modal.gpu.A100(count=N_GPUS) + +# This is also the point where we specify the data type for this model. +# We use IEEE 754-compliant half-precision floats, (`float16`), because we found that it resulted in marginally higher throughput, +# but the model is provided in Google's +# [`bfloat16` format](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format). +# On the latest Ada Lovelace chips, you might use `float8` to reduce GPU RAM usage and speed up inference, +# but note that the FP8 format is very new, so expect rough edges. + +DTYPE = "float16" + +# We put that all together with another invocation of `.run_commands`. + +CKPT_DIR = "/root/model/model_ckpt" +tensorrt_image = ( # update the image by converting the model to TensorRT format + tensorrt_image.run_commands( # takes ~5 minutes + [ + f"wget {CHECKPOINT_SCRIPT_URL} -O /root/convert_checkpoint.py", + f"python /root/convert_checkpoint.py --model_dir={MODEL_DIR} --output_dir={CKPT_DIR}" + + f" --tp_size={N_GPUS} --dtype={DTYPE}", + ], + gpu=GPU_CONFIG, # GPU must be present to load tensorrt_llm + ) +) + +# ## Compiling the engine +# +# TensorRT-LLM achieves its high throughput primarily by compiling the model: +# making concrete choices of CUDA kernels to execute for each operation. +# These kernels are much more specific than `matrix_multiply` or `softmax` -- +# they have names like `maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148t_nt`. +# They are optimized for the specific types and shapes of tensors that the model uses +# and for the specific hardware that the model runs on. +# +# That means we need to know all of that information a priori -- +# more like the original TensorFlow, which defined static graphs, than like PyTorch, +# which builds up a graph of kernels dynamically at runtime. +# +# This extra layer of constraint on our LLM service is precisely +# what allows TensorRT-LLM to achieve its high throughput. +# +# So we need to specify things like the maximum batch size and the lengths of inputs and outputs. +# The closer these are to the actual values we'll use in production, the better the throughput we'll get. + +MAX_INPUT_LEN, MAX_OUTPUT_LEN = 256, 256 +MAX_BATCH_SIZE = ( + 128 # better throughput at larger batch sizes, limited by GPU RAM +) +ENGINE_DIR = "/root/model/model_output" + +SIZE_ARGS = f"--max_batch_size={MAX_BATCH_SIZE} --max_input_len={MAX_INPUT_LEN} --max_output_len={MAX_OUTPUT_LEN}" + +# There are many additional options you can pass to `trtllm-build` to tune the engine for your specific workload. +# You can find the document we used for LLaMA +# [here](https://github.com/NVIDIA/TensorRT-LLM/tree/66ef1df492f7bc9c8eeb01d7e14db01838e3f0bd/examples/llama), +# which you can use to adjust the arguments to fit your workloads, +# e.g. adjusting rotary embeddings and block sizes for longer contexts. +# +# We selected plugins that accelerate two core components of the model: dense matrix multiplication and attention. +# You can read more about the plugin options [here](https://fetch.ai/blog/advancing-llm-optimization). + +PLUGIN_ARGS = f"--gemm_plugin={DTYPE} --gpt_attention_plugin={DTYPE}" + +# We put all of this together with another invocation of `.run_commands`. 
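+#
+# For reference, with the values defined above, the assembled `trtllm-build` command works out to roughly:
+#
+#   trtllm-build --checkpoint_dir /root/model/model_ckpt --output_dir /root/model/model_output \
+#       --tp_size=1 --workers=1 \
+#       --max_batch_size=128 --max_input_len=256 --max_output_len=256 \
+#       --gemm_plugin=float16 --gpt_attention_plugin=float16
+#
+# (our expansion of the f-strings for illustration, not an additional command to run).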
+ +tensorrt_image = ( # update the image by building the TensorRT engine + tensorrt_image.run_commands( # takes ~5 minutes + [ + f"trtllm-build --checkpoint_dir {CKPT_DIR} --output_dir {ENGINE_DIR}" + + f" --tp_size={N_GPUS} --workers={N_GPUS}" + + f" {SIZE_ARGS}" + + f" {PLUGIN_ARGS}" + ], + gpu=GPU_CONFIG, # TRT-LLM compilation is GPU-specific, so make sure this matches production! + ).env( # show more log information from the inference engine + {"TLLM_LOG_LEVEL": "INFO"} + ) +) + +# ## Serving inference at thousands of tokens per second +# +# Now that we have the engine compiled, we can serve it with Modal by creating an `App`. + +app = modal.App(f"example-trtllm-{MODEL_ID}", image=tensorrt_image) + +# Thanks to our custom container runtime system, even this +# large, many gigabyte container boots in seconds. +# +# At container start time, we boot up the engine, which completes in under 30 seconds. +# Container starts are triggered when Modal scales up your infrastructure, +# like the first time you run this code or the first time a request comes in after a period of inactivity. +# +# Container lifecycles in Modal are managed via our `Cls` interface, so we define one below +# to manage the engine and run inference. +# For details, see [this guide](https://modal.com/docs/guide/lifecycle-functions). + + +@app.cls( + gpu=GPU_CONFIG, + secrets=[modal.Secret.from_name("huggingface-secret")], + container_idle_timeout=10 * MINUTES, +) +class Model: + @modal.enter() + def load(self): + """Loads the TRT-LLM engine and configures our tokenizer. + + The @enter decorator ensures that it runs only once per container, when it starts.""" + import time + + print( + f"{COLOR['HEADER']}🥶 Cold boot: spinning up TRT-LLM engine{COLOR['ENDC']}" + ) + self.init_start = time.monotonic_ns() + + import tensorrt_llm + from tensorrt_llm.runtime import ModelRunner + from transformers import AutoTokenizer + + self.tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + # LLaMA models do not have a padding token, so we use the EOS token + self.tokenizer.add_special_tokens( + {"pad_token": self.tokenizer.eos_token} + ) + # and then we add it from the left, to minimize impact on the output + self.tokenizer.padding_side = "left" + self.pad_id = self.tokenizer.pad_token_id + self.end_id = self.tokenizer.eos_token_id + + runner_kwargs = dict( + engine_dir=f"{ENGINE_DIR}", + lora_dir=None, + rank=tensorrt_llm.mpi_rank(), # this will need to be adjusted to use multiple GPUs + ) + + self.model = ModelRunner.from_dir(**runner_kwargs) + + self.init_duration_s = (time.monotonic_ns() - self.init_start) / 1e9 + print( + f"{COLOR['HEADER']}🚀 Cold boot finished in {self.init_duration_s}s{COLOR['ENDC']}" + ) + + @modal.method() + def generate(self, prompts: list[str], settings=None): + """Generate responses to a batch of prompts, optionally with custom inference settings.""" + import time + + if settings is None: + settings = dict( + temperature=0.1, # temperature 0 not allowed, so we set top_k to 1 to get the same effect + top_k=1, + stop_words_list=None, + repetition_penalty=1.1, + ) + + settings[ + "max_new_tokens" + ] = MAX_OUTPUT_LEN # exceeding this will raise an error + settings["end_id"] = self.end_id + settings["pad_id"] = self.pad_id + + num_prompts = len(prompts) + + if num_prompts > MAX_BATCH_SIZE: + raise ValueError( + f"Batch size {num_prompts} exceeds maximum of {MAX_BATCH_SIZE}" + ) + + print( + f"{COLOR['HEADER']}🚀 Generating completions for batch of size {num_prompts}...{COLOR['ENDC']}" + ) + start = 
time.monotonic_ns() + + parsed_prompts = [ + self.tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + for prompt in prompts + ] + + print( + f"{COLOR['HEADER']}Parsed prompts:{COLOR['ENDC']}", + *parsed_prompts, + sep="\n\t", + ) + + inputs_t = self.tokenizer( + parsed_prompts, return_tensors="pt", padding=True, truncation=False + )["input_ids"] + + print( + f"{COLOR['HEADER']}Input tensors:{COLOR['ENDC']}", inputs_t[:, :8] + ) + + outputs_t = self.model.generate(inputs_t, **settings) + + outputs_text = self.tokenizer.batch_decode( + outputs_t[:, 0] + ) # only one output per input, so we index with 0 + + responses = [ + extract_assistant_response(output_text) + for output_text in outputs_text + ] + duration_s = (time.monotonic_ns() - start) / 1e9 + + num_tokens = sum( + map(lambda r: len(self.tokenizer.encode(r)), responses) + ) + + for prompt, response in zip(prompts, responses): + print( + f"{COLOR['HEADER']}{COLOR['GREEN']}{prompt}", + f"\n{COLOR['BLUE']}{response}", + "\n\n", + sep=COLOR["ENDC"], + ) + time.sleep(0.01) # to avoid log truncation + + print( + f"{COLOR['HEADER']}{COLOR['GREEN']}Generated {num_tokens} tokens from {MODEL_ID} in {duration_s:.1f} seconds," + f" throughput = {num_tokens / duration_s:.0f} tokens/second for batch of size {num_prompts} on {GPU_CONFIG}.{COLOR['ENDC']}" + ) + + return responses + + +# ## Calling our inference function +# +# Now, how do we actually run the model? +# +# There are two basic methods: from Python via our SDK or from anywhere, by setting up an API. +# +# ### Calling inference from Python +# +# To run our `Model`'s `.generate` method from Python, we just need to call it -- +# with `.remote` appended to run it on Modal. +# +# We wrap that logic in a `local_entrypoint` so you can run it from the command line with +# ```bash +# modal run trtllm_llama.py +# ``` +# +# For simplicity, we hard-code a batch of 128 questions to ask the model. 
+ + +@app.local_entrypoint() +def main(): + questions = [ + # Generic assistant questions + "What are you?", + "What can you do?", + # Coding + "Implement a Python function to compute the Fibonacci numbers.", + "Write a Rust function that performs binary exponentiation.", + "How do I allocate memory in C?", + "What are the differences between Javascript and Python?", + "How do I find invalid indices in Postgres?", + "How can you implement a LRU (Least Recently Used) cache in Python?", + "What approach would you use to detect and prevent race conditions in a multithreaded application?", + "Can you explain how a decision tree algorithm works in machine learning?", + "How would you design a simple key-value store database from scratch?", + "How do you handle deadlock situations in concurrent programming?", + "What is the logic behind the A* search algorithm, and where is it used?", + "How can you design an efficient autocomplete system?", + "What approach would you take to design a secure session management system in a web application?", + "How would you handle collision in a hash table?", + "How can you implement a load balancer for a distributed system?", + "Implement a Python class for a doubly linked list.", + "Write a Haskell function that generates prime numbers using the Sieve of Eratosthenes.", + "Develop a simple HTTP server in Rust.", + # Literate and creative writing + "What is the fable involving a fox and grapes?", + "Who does Harry turn into a balloon?", + "Write a story in the style of James Joyce about a trip to the Australian outback in 2083 to see robots in the beautiful desert.", + "Write a tale about a time-traveling historian who's determined to witness the most significant events in human history.", + "Describe a day in the life of a secret agent who's also a full-time parent.", + "Create a story about a detective who can communicate with animals.", + "What is the most unusual thing about living in a city floating in the clouds?", + "In a world where dreams are shared, what happens when a nightmare invades a peaceful dream?", + "Describe the adventure of a lifetime for a group of friends who found a map leading to a parallel universe.", + "Tell a story about a musician who discovers that their music has magical powers.", + "In a world where people age backwards, describe the life of a 5-year-old man.", + "Create a tale about a painter whose artwork comes to life every night.", + "What happens when a poet's verses start to predict future events?", + "Imagine a world where books can talk. 
How does a librarian handle them?", + "Tell a story about an astronaut who discovered a planet populated by plants.", + "Describe the journey of a letter traveling through the most sophisticated postal service ever.", + "Write a tale about a chef whose food can evoke memories from the eater's past.", + "Write a poem in the style of Walt Whitman about the modern digital world.", + "Create a short story about a society where people can only speak in metaphors.", + "What are the main themes in Dostoevsky's 'Crime and Punishment'?", + # History and Philosophy + "What were the major contributing factors to the fall of the Roman Empire?", + "How did the invention of the printing press revolutionize European society?", + "What are the effects of quantitative easing?", + "How did the Greek philosophers influence economic thought in the ancient world?", + "What were the economic and philosophical factors that led to the fall of the Soviet Union?", + "How did decolonization in the 20th century change the geopolitical map?", + "What was the influence of the Khmer Empire on Southeast Asia's history and culture?", + "What led to the rise and fall of the Mongol Empire?", + "Discuss the effects of the Industrial Revolution on urban development in 19th century Europe.", + "How did the Treaty of Versailles contribute to the outbreak of World War II?", + "What led to the rise and fall of the Mongol Empire?", + "Discuss the effects of the Industrial Revolution on urban development in 19th century Europe.", + "How did the Treaty of Versailles contribute to the outbreak of World War II?", + "Explain the concept of 'tabula rasa' in John Locke's philosophy.", + "What does Nietzsche mean by 'ressentiment'?", + "Compare and contrast the early and late works of Ludwig Wittgenstein. Which do you prefer?", + "How does the trolley problem explore the ethics of decision-making in critical situations?", + # Thoughtfulness + "Describe the city of the future, considering advances in technology, environmental changes, and societal shifts.", + "In a dystopian future where water is the most valuable commodity, how would society function?", + "If a scientist discovers immortality, how could this impact society, economy, and the environment?", + "What could be the potential implications of contact with an advanced alien civilization?", + "Describe how you would mediate a conflict between two roommates about doing the dishes using techniques of non-violent communication.", + "If you could design a school curriculum for the future, what subjects would you include to prepare students for the next 50 years?", + "How would society change if teleportation was invented and widely accessible?", + "Consider a future where artificial intelligence governs countries. What are the potential benefits and pitfalls?", + # Math + "What is the product of 9 and 8?", + "If a train travels 120 kilometers in 2 hours, what is its average speed?", + "Think through this step by step. If the sequence a_n is defined by a_1 = 3, a_2 = 5, and a_n = a_(n-1) + a_(n-2) for n > 2, find a_6.", + "Think through this step by step. Calculate the sum of an arithmetic series with first term 3, last term 35, and total terms 11.", + "Think through this step by step. What is the area of a triangle with vertices at the points (1,2), (3,-4), and (-2,5)?", + "Think through this step by step. 
Solve the following system of linear equations: 3x + 2y = 14, 5x - y = 15.", + # Facts + "Who was Emperor Norton I, and what was his significance in San Francisco's history?", + "What is the Voynich manuscript, and why has it perplexed scholars for centuries?", + "What was Project A119 and what were its objectives?", + "What is the 'Dyatlov Pass incident' and why does it remain a mystery?", + "What is the 'Emu War' that took place in Australia in the 1930s?", + "What is the 'Phantom Time Hypothesis' proposed by Heribert Illig?", + "Who was the 'Green Children of Woolpit' as per 12th-century English legend?", + "What are 'zombie stars' in the context of astronomy?", + "Who were the 'Dog-Headed Saint' and the 'Lion-Faced Saint' in medieval Christian traditions?", + "What is the story of the 'Globsters', unidentified organic masses washed up on the shores?", + "Which countries in the European Union use currencies other than the Euro, and what are those currencies?", + # Multilingual + "战国时期最重要的人物是谁?", + "Tuende hatua kwa hatua. Hesabu jumla ya mfululizo wa kihesabu wenye neno la kwanza 2, neno la mwisho 42, na jumla ya maneno 21.", + "Kannst du die wichtigsten Eigenschaften und Funktionen des NMDA-Rezeptors beschreiben?", + "¿Cuáles son los principales impactos ambientales de la deforestación en la Amazonía?", + "Décris la structure et le rôle de la mitochondrie dans une cellule.", + "Какие были социальные последствия Перестройки в Советском Союзе?", + # Economics and Business + "What are the principles of behavioral economics and how do they influence consumer choices?", + "Discuss the impact of blockchain technology on traditional banking systems.", + "What are the long-term effects of trade wars on global economic stability?", + "What is the law of supply and demand?", + "Explain the concept of inflation and its typical causes.", + "What is a trade deficit, and why does it matter?", + "How do interest rates affect consumer spending and saving?", + "What is GDP and why is it important for measuring economic health?", + "What is the difference between revenue and profit?", + "Describe the role of a business plan in startup success.", + "How does market segmentation benefit a company?", + "Explain the concept of brand equity.", + "What are the advantages of franchising a business?", + "What are Michael Porter's five forces and how do they impact strategy for tech startups?", + # Science and Technology + "Discuss the potential impacts of quantum computing on data security.", + "How could CRISPR technology change the future of medical treatments?", + "Explain the significance of graphene in the development of future electronics.", + "How do renewable energy sources compare to fossil fuels in terms of environmental impact?", + "What are the most promising technologies for carbon capture and storage?", + "Explain why the sky is blue.", + "What is the principle behind the operation of a microwave oven?", + "How does Newton's third law apply to rocket propulsion?", + "What causes iron to rust?", + "Describe the process of photosynthesis in simple terms.", + "What is the role of a catalyst in a chemical reaction?", + "What is the basic structure of a DNA molecule?", + "How do vaccines work to protect the body from disease?", + "Explain the significance of mitosis in cellular reproduction.", + "What are tectonic plates and how do they affect earthquakes?", + "How does the greenhouse effect contribute to global warming?", + "Describe the water cycle and its importance to Earth's climate.", + "What 
causes the phases of the Moon?", + "How do black holes form?", + "Explain the significance of the Big Bang theory.", + "What is the function of the CPU in a computer system?", + "Explain the difference between RAM and ROM.", + "How does a solid-state drive (SSD) differ from a hard disk drive (HDD)?", + "What role does the motherboard play in a computer system?", + "Describe the purpose and function of a GPU.", + "What is TensorRT? What role does it play in neural network inference?", + ] + + model = Model() + model.generate.remote(questions) + # if you're calling this service from another Python project, + # use [`Model.lookup`](https://modal.com/docs/reference/modal.Cls#lookup) + + +# ### Calling inference via an API +# +# We can use `modal.web_endpoint` and `app.function` to turn any Python function into a web API. +# +# This API wrapper doesn't need all the dependencies of the core inference service, +# so we switch images here to a basic Linux image, `debian_slim`, which has everything we need. + +web_image = modal.Image.debian_slim(python_version="3.10") + +# From there, we can take the same remote generation logic we used in `main` +# and serve it with only a few more lines of code. + + +@app.function(image=web_image) +@modal.web_endpoint(method="POST") +def generate_web(data: dict): + return Model.generate.remote(data["prompts"], settings=None) + + +# To set our function up as a web endpoint, we need to run this file -- +# with `modal serve` to create a hot-reloading development server or `modal deploy` to deploy it to production. +# +# ```bash +# modal serve trtllm_llama.py +# ``` +# +# You can test the endpoint by sending a POST request with `curl` from another terminal: +# +# ```bash +# curl -X POST url-from-output-of-modal-serve-here \ +# -H "Content-Type: application/json" \ +# -d '{ +# "prompts": ["Tell me a joke", "Describe a dream you had recently", "Share your favorite childhood memory"] +# }' | python -m json.tool # python for pretty-printing, optional +# ``` +# +# And now you have a high-throughput, low-latency, autoscaling API for serving LLaMA 3 8B completions! +# +# ## Footer +# +# The rest of the code in this example is utility code. + + +COLOR = { + "HEADER": "\033[95m", + "BLUE": "\033[94m", + "GREEN": "\033[92m", + "RED": "\033[91m", + "ENDC": "\033[0m", +} + + +def extract_assistant_response(output_text): + """Model-specific code to extract model responses. 
+ + See this doc for LLaMA 3: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/.""" + # Split the output text by the assistant header token + parts = output_text.split("<|start_header_id|>assistant<|end_header_id|>") + + if len(parts) > 1: + # Join the parts after the first occurrence of the assistant header token + response = parts[1].split("<|eot_id|>")[0].strip() + + # Remove any remaining special tokens and whitespace + response = response.replace("<|eot_id|>", "").strip() + + return response + else: + return output_text From 45f75b7377139d168b627e147669965a49749d87 Mon Sep 17 00:00:00 2001 From: Minki Jung <113061064+jung0072@users.noreply.github.com> Date: Thu, 25 Apr 2024 17:57:27 -0400 Subject: [PATCH 02/17] add python version to the image to prevent pip install error (#715) --- 06_gpu_and_ml/llm-serving/vllm_gemma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/06_gpu_and_ml/llm-serving/vllm_gemma.py b/06_gpu_and_ml/llm-serving/vllm_gemma.py index 9cefef401..2a3545961 100644 --- a/06_gpu_and_ml/llm-serving/vllm_gemma.py +++ b/06_gpu_and_ml/llm-serving/vllm_gemma.py @@ -76,7 +76,7 @@ def download_model_to_image(model_dir, model_name): # and save the resulting files to the container image -- that way we don't need # to redownload the weights every time we change the server's code or start up more instances of the server. image = ( - modal.Image.debian_slim() + modal.Image.debian_slim(python_version="3.10") .pip_install( "vllm==0.4.0.post1", "torch==2.1.2", From 0e6f4d817136e9746fd017b587ba14744673b9b2 Mon Sep 17 00:00:00 2001 From: Alonso Astroza Tagle Date: Sun, 28 Apr 2024 23:50:46 -0400 Subject: [PATCH 03/17] python_version in Image declaration (#720) --- 06_gpu_and_ml/llm-serving/vllm_inference.py | 2 +- 06_gpu_and_ml/llm-serving/vllm_mixtral.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py index c3d0375c4..9f6b23a6a 100644 --- a/06_gpu_and_ml/llm-serving/vllm_inference.py +++ b/06_gpu_and_ml/llm-serving/vllm_inference.py @@ -56,7 +56,7 @@ def download_model_to_image(model_dir, model_name): # We’ll start from Modal's Debian slim image. # Then we’ll use `run_function` with `download_model_to_image` to write the model into the container image. image = ( - modal.Image.debian_slim() + modal.Image.debian_slim(python_version="3.10") .pip_install( "vllm==0.4.0.post1", "torch==2.1.2", diff --git a/06_gpu_and_ml/llm-serving/vllm_mixtral.py b/06_gpu_and_ml/llm-serving/vllm_mixtral.py index 9bb9471ba..57618ae28 100644 --- a/06_gpu_and_ml/llm-serving/vllm_mixtral.py +++ b/06_gpu_and_ml/llm-serving/vllm_mixtral.py @@ -63,7 +63,7 @@ def download_model_to_image(model_dir, model_name, model_revision): # the model are saved within the container image. 
vllm_image = ( - modal.Image.debian_slim() + modal.Image.debian_slim(python_version="3.10") .pip_install( "vllm==0.4.0.post1", "torch==2.1.2", From dabdf155706bf8f56b3a9b4ebc15afedd95030a3 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Sun, 28 Apr 2024 22:45:00 -0700 Subject: [PATCH 04/17] Clean up instructor example (#722) * removes extra inference file * cleans up instructor example --- .../llm-structured/instructor/inference.py | 79 --------- .../instructor/instructor_generate.py | 161 ++++++++++-------- 2 files changed, 90 insertions(+), 150 deletions(-) delete mode 100644 06_gpu_and_ml/llm-structured/instructor/inference.py diff --git a/06_gpu_and_ml/llm-structured/instructor/inference.py b/06_gpu_and_ml/llm-structured/instructor/inference.py deleted file mode 100644 index d8e765764..000000000 --- a/06_gpu_and_ml/llm-structured/instructor/inference.py +++ /dev/null @@ -1,79 +0,0 @@ -# # Fast inference with vLLM (Mistral 7B) -# -# In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm) -# to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching. - -import os -import subprocess - -from modal import App, Image, Secret, gpu, web_server - -MODEL_DIR = "/model" -BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1" - - -# ## Define a container image - - -# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this -# is that the container no longer has to re-download the model from Huggingface - instead, it will take -# advantage of Modal's internal filesystem for faster cold starts. -# -# ### Download the weights -# We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. -# -# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. -def download_model_to_folder(): - from huggingface_hub import snapshot_download - from transformers.utils import move_cache - - os.makedirs(MODEL_DIR, exist_ok=True) - - snapshot_download( - BASE_MODEL, - local_dir=MODEL_DIR, - ignore_patterns=["*.pt", "*.bin"], # Using safetensors - ) - move_cache() - - -# ### Image definition -# We'll start from a recommended Docker Hub image and install `vLLM`. -# Then we'll use `run_function` to run the function defined above to ensure the weights of -# the model are saved within the container image. -image = ( - Image.from_registry( - "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10" - ) - .pip_install( - "vllm==0.2.5", - "huggingface_hub==0.19.4", - "hf-transfer==0.1.4", - "torch==2.1.2", - ) - # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 
- .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) - .run_function( - download_model_to_folder, - secrets=[Secret.from_name("huggingface")], - timeout=60 * 20, - ) -) - -app = App( - "vllm-inference-openai-compatible", image=image -) # Note: prior to April 2024, "app" was called "stub" - - -GPU_CONFIG = gpu.A100(count=1) # 40GB A100 by default - - -@app.function( - allow_concurrent_inputs=100, - gpu=GPU_CONFIG, -) -@web_server(8000, startup_timeout=90) -def openai_compatible_server(): - target = BASE_MODEL - cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --host 0.0.0.0 --port 8000" - subprocess.Popen(cmd, shell=True) diff --git a/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py b/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py index ca6d66fec..242f419d3 100644 --- a/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py +++ b/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py @@ -3,47 +3,46 @@ # --- # # Structured Data Extraction using `instructor` # -# This example demonstrates how to use the `instructor` library to extract structured data from unstructured text. +# This example demonstrates how to use the `instructor` library to extract structured, schematized data from unstructured text. # -# Structured output is a powerful but under-appreciated feature of LLMs, -# because it makes it easier to connect LLMs to other software, -# for example enabling the ingestion of unstructured data into structured databases. +# Structured output is a powerful but under-appreciated feature of LLMs. +# Structured output allows LLMs and multimodal models to connect to traditional software, +# for example enabling the ingestion of unstructured data like text files into structured databases. +# Applied properly, it makes them an extreme example of the [Robustness Principle](https://en.wikipedia.org/wiki/Robustness_principle) +# Jon Postel formulated for TCP: "Be conservative in what you send, be liberal in what you accept". # -# The unstructured data in this example is the code from the examples in the Modal examples repository -- -# including this one! -# -# We use this exact code to monitor the coverage of the examples -# and to make decisions about which examples to write next! +# The unstructured data used in this example code is the code from the examples in the Modal examples repository -- +# including this example's code! # # The output includes a JSONL file containing, on each line, the metadata extracted from the code in one example. # This can be consumed downstream by other software systems, like a database or a dashboard. # -# We include in this folder a Jupyter notebook with some basic analyses. -# # ## Environment setup # -# We setup the environment our code will run in first. +# We set up the environment our code will run in first. # In Modal, we define environments via [container images](https://modal.com/docs/guide/custom-container), # much like Docker images, by iteratively chaining together commands. # -# This example also uses models from Anthropic, so if you want to run it yourself, -# you'll need to set up a Modal [`Secret`](https://modal.com/docs/guide/secrets) -# called `my-anthropic-secret` for your OpenAI API key. +# Here there's just one command, installing instructor and the Python SDK for Anthropic's LLM API. 
from pathlib import Path from typing import Literal, Optional import modal +from pydantic import BaseModel, Field image = modal.Image.debian_slim(python_version="3.11").pip_install( - "instructor~=1.0.0", "anthropic~=0.23.1", "matplotlib~=3.8.3" + "instructor~=1.0.0", "anthropic~=0.23.1" ) +# This example uses models from Anthropic, so if you want to run it yourself, +# you'll need to set up a Modal [`Secret`](https://modal.com/docs/guide/secrets) +# called `my-anthropic-secret` for your OpenAI API key. + app = modal.App( image=image, secrets=[modal.Secret.from_name("my-anthropic-secret")] ) # Note: prior to April 2024, "app" was called "stub" - -# ## The overall flow +# ## Running Modal functions from the command line # # We'll run the example by calling `modal run instructor_generate.py` from the command line. # @@ -64,7 +63,7 @@ @app.local_entrypoint() -def main(limit: int = 15, with_opus: bool = False): +def main(limit: int = 1, with_opus: bool = False): # find all of the examples in the repo examples = get_examples() # optionally limit the number of examples we process @@ -72,17 +71,17 @@ def main(limit: int = 15, with_opus: bool = False): examples = [None] # just run on this example else: examples = examples[:limit] - if examples: - # use Modal to map our extraction function over the examples concurrently - results = extract_example_metadata.map( - [ - f"{example.stem}\n" + Path(example.filename).read_text() - if example - else None - for example in examples - ], - kwargs={"with_opus": with_opus}, - ) + # use Modal to map our extraction function over the examples concurrently + results = extract_example_metadata.map( + ( # iterable of file contents + Path(example.filename).read_text() if example else None + for example in examples + ), + ( # iterable of filenames + example.stem if example else None for example in examples + ), + kwargs={"with_opus": with_opus}, + ) # save the results to a local file results_path = Path("/tmp") / "instructor_generate" / "results.jsonl" @@ -97,15 +96,65 @@ def main(limit: int = 15, with_opus: bool = False): f.write(result + "\n") -# ## Extracting JSON from unstructured text with `instructor` +# ## Extracting JSON from unstructured text with `instructor` and Pydantic # -# The real meat of this example is here, in the `extract_example_metadata` function. +# The real meat of this example is in this section, in the `extract_example_metadata` function and its schemas. # -# TODO: write this up -# TODO: refactor classes out of this function, explain separately +# We define a schema for the data we want the LLM to extract, using Pydantic. +# Instructor ensures that the LLM's output matches this schema. +# +# We can use the type system provided by Python and Pydantic to express many useful features +# of the data we want to extract -- ranging from wide-open fields like a `str`ing-valued `summary` +# to constrained fields like `difficulty`, which can only take on value between 1 and 5. + +class ExampleMetadataExtraction(BaseModel): + """Extracted metadata about an example from the Modal examples repo.""" -@app.function(concurrency_limit=5) # watch those rate limits! 
+ summary: str = Field(..., description="A brief summary of the example.") + has_thorough_explanation: bool = Field( + ..., + description="The example contains, in the form of inline comments with markdown formatting, a thorough explanation of what the code does.", + ) + domains: list[ + Literal[ + "artificial_intelligence", + "machine_learning", + "data_science", + "web_serving", + "parallel_computing", + ] + ] = Field(..., description="The") + difficulty: Literal[1, 2, 3, 4, 5] = Field( + ..., + description="The difficulty of the example, from 1 to 5. An example that uses only one or two basic Modal features and is understandable by a professional Python developer familiar with the basics of the relevant domains is a 1, while an example that uses many Modal features and uses advanced Python features like async generator coroutines or metaclasses is a 5.", + ) + freshness: float = Field( + ..., + description="The freshness of the example, from 0 to 1. This is relative to your knowledge cutoff. Examples are less fresh if they use older libraries and tools.", + ) + + +# That schema describes the data to be extracted by the LLM, but not all data is best extracted by an LLM. +# For example, the filename is easily determined in software. +# +# So we inject that information into the output after the LLM has done its work. That necessitates +# an additional schema, which inherits from the first. + + +class ExampleMetadata(ExampleMetadataExtraction): + """Metadata about an example from the Modal examples repo.""" + + filename: Optional[str] = Field( + ..., description="The filename of the example." + ) + + +# With these schemas in hand, it's straightforward to write the function that extracts the metadata. +# Note that we decorate it with `@app.function` to make it run on Modal. + + +@app.function(concurrency_limit=5) # watch those LLM API rate limits! def extract_example_metadata( example_contents: Optional[str] = None, filename: Optional[str] = None, @@ -113,47 +162,16 @@ def extract_example_metadata( ): import instructor from anthropic import Anthropic - from pydantic import BaseModel, Field + # if no example is provided, use the contents of this example if example_contents is None: example_contents = Path(__file__).read_text() filename = Path(__file__).name - class ExampleMetadataExtraction(BaseModel): - """Extracted metadata about an example from the Modal examples repo.""" - - summary: str = Field(..., description="A brief summary of the example.") - has_thorough_explanation: bool = Field( - ..., - description="The example contains, in the form of inline comments with markdown formatting, a thorough explanation of what the code does.", - ) - domains: list[ - Literal[ - "artificial_intelligence", - "machine_learning", - "data_science", - "web_serving", - "parallel_computing", - ] - ] = Field(..., description="The") - difficulty: Literal[1, 2, 3, 4, 5] = Field( - ..., - description="The difficulty of the example, from 1 to 5. An example that uses only one or two basic Modal features and is understandable by a professional Python developer familiar with the basics of the relevant domains is a 1, while an example that uses many Modal features and uses advanced Python features like async generator coroutines or metaclasses is a 5.", - ) - freshness: float = Field( - ..., - description="The freshness of the example, from 0 to 1. This is relative to your knowledge cutoff. 
Examples are less fresh if they use older libraries and tools.", - ) - - class ExampleMetadata(ExampleMetadataExtraction): - """Metadata about an example from the Modal examples repo.""" - - filename: str = Field(..., description="The filename of the example.") - client = instructor.from_anthropic(Anthropic()) - model = "claude-3-opus-20240229" if with_opus else "claude-3-haiku-20240307" + # add the schema as the `response_model` argument in what otherwise looks like a normal LLM API call extracted_metadata = client.messages.create( model=model, temperature=0.0, @@ -167,18 +185,19 @@ class ExampleMetadata(ExampleMetadataExtraction): ], ) + # inject the filename full_metadata = ExampleMetadata( **extracted_metadata.dict(), filename=filename ) + # return it as JSON return full_metadata.model_dump_json() # ## Addenda # # The rest of the code used in this example is not particularly interesting: -# some boilerplate matplotlib code to generate the figures, -# and a utility function to find all of the examples. +# just a utility function to find all of the examples, which we invoke in the `local_entrypoint` above. def get_examples(silent=True): @@ -195,7 +214,7 @@ def get_examples(silent=True): spec.loader.exec_module(example_utils) examples = [ example - for example in example_utils.get_examples(silent=silent) + for example in example_utils.get_examples() if example.type != 2 # filter out non-code assets ] return examples From d35dd4386b8c5248f5e83e2899f9177fa6cdb1fc Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Sun, 28 Apr 2024 23:24:26 -0700 Subject: [PATCH 05/17] resurrect blender (#723) * adds a refreshed blender example * adds gif of final render --- 06_gpu_and_ml/blender/blender_video.py | 296 +++++++++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 06_gpu_and_ml/blender/blender_video.py diff --git a/06_gpu_and_ml/blender/blender_video.py b/06_gpu_and_ml/blender/blender_video.py new file mode 100644 index 000000000..f5cc55622 --- /dev/null +++ b/06_gpu_and_ml/blender/blender_video.py @@ -0,0 +1,296 @@ +# --- +# output-directory: "/tmp/render" +# --- +# # Render a video with Blender on many GPUs or CPUs in parallel +# +# This example shows how you can render an animated 3D scene using +# [Blender](https://www.blender.org/)'s Python interface. +# +# You can run it on CPUs to scale out on one hundred of containers +# or run it on GPUs to get higher throughput per node. +# Even with this simple scene, GPUs render 2x faster than CPUs. +# +# The final render looks something like this: +# +# ![Spinning Modal logo](https://modal-public-assets.s3.amazonaws.com/modal-blender-render.gif) +# +# ## Defining a Modal app + +import io +import math +from pathlib import Path + +import modal + +# Modal runs your Python functions for you in the cloud. +# You organize your code into apps, collections of functions that work together. + +app = modal.App("examples-blender-logo") + +# We need to define the environment each function runs in -- its container image. +# The block below defines a container image, starting from a basic Debian Linux image +# adding Blender's system-level dependencies +# and then installing the `bpy` package, which is Blender's Python API. + +rendering_image = ( + modal.Image.debian_slim(python_version="3.11") + .apt_install("xorg", "libxkbcommon0") # X11 (Unix GUI) dependencies + .pip_install("bpy") # Blender as a Python package +) + +# ## Rendering a single frame +# +# We define a function that renders a single frame. 
We'll scale this function out on Modal later. +# +# Functions in Modal are defined along with their hardware and their dependencies. +# This function can be run with GPU acceleration or without it, and we'll use a global flag in the code to switch between the two. + +WITH_GPU = True # try changing this to False to run rendering massively in parallel on CPUs! + +# We decorate the function with `@app.function` to define it as a Modal function. +# Note that in addition to defining the hardware requirements of the function, +# we also specify the container image that the function runs in (the one we defined above). + +# The details of the rendering function aren't too important for this example, +# so we abstract them out into functions defined at the end of the file. +# We draw a simple version of the Modal logo: +# two neon green rectangular prisms facing different directions. +# We include a parameter to rotate the prisms around the vertical/Z axis, +# which we'll use to animate the logo. + + +@app.function( + gpu="T4" if WITH_GPU else None, + concurrency_limit=10 + if WITH_GPU + else 100, # default limits on Modal free tier + image=rendering_image, +) +def render(angle: int = 0) -> bytes: + """ + Renders Modal's logo, two neon green rectangular prisms. + + + Args: + angle: How much to rotate the two prisms around the vertical/Z axis, in degrees. + + Returns: + The rendered frame as a PNG image. + """ + import bpy + + # clear existing objects + bpy.ops.object.select_all(action="DESELECT") + bpy.ops.object.select_by_type(type="MESH") + bpy.ops.object.delete() + + # ctx: the current Blender state, which we mutate + ctx = bpy.context + + # scene: the 3D environment we are rendering and its camera(s) + scene = ctx.scene + + # configure rendering -- CPU or GPU, resolution, etc. + # see function definition below for details + configure_rendering(ctx, WITH_GPU) + + scene.render.image_settings.file_format = "PNG" + scene.render.filepath = "output.png" + + # set background to black + black = (0, 0, 0, 1) + scene.world.node_tree.nodes["Background"].inputs[0].default_value = black + + # add the Modal logo: two neon green rectangular prisms + iridescent_material = create_iridescent_material() + + add_prism(ctx, (-1, 0, 0), 45, angle, iridescent_material) + add_prism(ctx, (3, 0, 0), -45, angle, iridescent_material) + + # set up the lighting and camera + bpy.ops.object.light_add(type="POINT", location=(5, 5, 5)) + bpy.context.object.data.energy = 10 + bpy.ops.object.camera_add(location=(7, -7, 5)) + scene.camera = bpy.context.object + ctx.object.rotation_euler = (1.1, 0, 0.785) + + # render + bpy.ops.render.render(write_still=True) + + # return the bytes to the caller + with open(scene.render.filepath, "rb") as image_file: + image_bytes = image_file.read() + + return image_bytes + + +# ### Rendering with acceleration +# +# We can configure the rendering process to use GPU acceleration with NVIDIA CUDA. +# We select the [Cycles rendering engine](https://www.cycles-renderer.org/), which is compatible with CUDA, +# and then activate the GPU. 
+ + +def configure_rendering(ctx, with_gpu: bool): + # configure the rendering process + ctx.scene.render.engine = "CYCLES" + ctx.scene.render.resolution_x = 1920 + ctx.scene.render.resolution_y = 1080 + ctx.scene.render.resolution_percentage = 100 + + # add GPU acceleration if available + if with_gpu: + ctx.preferences.addons[ + "cycles" + ].preferences.compute_device_type = "CUDA" + ctx.scene.cycles.device = "GPU" + + # reload the devices to update the configuration + ctx.preferences.addons["cycles"].preferences.get_devices() + for device in ctx.preferences.addons["cycles"].preferences.devices: + device.use = True + + else: + ctx.scene.cycles.device = "CPU" + + # report rendering devices -- a nice snippet for debugging and ensuring the accelerators are being used + for dev in ctx.preferences.addons["cycles"].preferences.devices: + print( + f"ID:{dev['id']} Name:{dev['name']} Type:{dev['type']} Use:{dev['use']}" + ) + + +# ## Combining frames into a GIF +# +# Rendering 3D images is fun, and GPUs can make it faster, but rendering 3D videos is better! +# We add another function to our app, running on a different, simpler container image +# and different hardware, to combine the frames into a GIF. + +combination_image = modal.Image.debian_slim(python_version="3.11").pip_install( + "pillow==10.3.0" +) + +# The video has a few parameters, which we set here. + +FPS = 60 +FRAME_DURATION_MS = 1000 // FPS +NUM_FRAMES = 360 # drop this for faster iteration while playing around + +# The function to combine the frames into a GIF takes a sequence of byte sequences, one for each rendered frame, +# and converts them into a single sequence of bytes, the GIF. + + +@app.function(image=combination_image) +def combine( + frames_bytes: list[bytes], frame_duration: int = FRAME_DURATION_MS +) -> bytes: + print("🎞️ combining frames into a gif") + from PIL import Image + + frames = [ + Image.open(io.BytesIO(frame_bytes)) for frame_bytes in frames_bytes + ] + + gif_image = io.BytesIO() + frames[0].save( + gif_image, + format="GIF", + save_all=True, + append_images=frames[1:], + duration=frame_duration, + loop=0, + ) + + gif_image.seek(0) + + return gif_image.getvalue() + + +# ## Rendering in parallel in the cloud from the comfort of the command line +# +# With these two functions defined, we need only a few more lines to run our rendering at scale on Modal. +# +# First, we need a function that coordinates our functions to `render` frames and `combine` them. +# We decorate that function with `@app.local_entrypoint` so that we can run it with `modal run blender_video.py`. +# +# In that function, we use `render.map` to map the `render` function over a `range` of `angle`s, +# so that the logo will appear to spin in the final video. +# +# We collect the bytes from each frame into a `list` locally and then send it to `combine` with `.remote`. +# +# The bytes for the video come back to our local machine, and we write them to a file. +# +# The whole rendering process (for six seconds of 1080p 60 FPS video) takes between five and ten minutes on 10 T4 GPUs. + + +@app.local_entrypoint() +def main(): + output_directory = Path("/tmp") / "render" + output_directory.mkdir(parents=True, exist_ok=True) + filename = output_directory / "output.gif" + with open(filename, "wb") as out_file: + out_file.write( + combine.remote(list(render.map(range(0, 360, 360 // NUM_FRAMES)))) + ) + print(f"Image saved to {filename}") + + +# ## Addenda +# +# The remainder of the code in this example defines the details of the render. 
+# It's not particularly interesting, so we put it the end of the file. + + +def add_prism(ctx, location, initial_rotation, angle, material): + """Add a prism at a given location, rotation, and angle, made of the provided material.""" + import bpy + import mathutils + + bpy.ops.mesh.primitive_cube_add(size=2, location=location) + obj = ctx.object # the newly created object + + # assign the material to the object + obj.data.materials.append(material) + + obj.scale = (1, 1, 2) # square base, 2x taller than wide + # Modal logo is rotated 45 degrees + obj.rotation_euler[1] = math.radians(initial_rotation) + + # apply initial transformations + bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) + + # to "animate" the rendering, we rotate the prisms around the Z axis + angle_radians = math.radians(angle) + rotation_matrix = mathutils.Matrix.Rotation(angle_radians, 4, "Z") + obj.matrix_world = rotation_matrix @ obj.matrix_world + bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) + + +def create_iridescent_material(): + import bpy + + mat = bpy.data.materials.new(name="IridescentGreen") + mat.use_nodes = True + nodes = mat.node_tree.nodes + links = mat.node_tree.links + + nodes.clear() + + output_node = nodes.new(type="ShaderNodeOutputMaterial") + emission_node = nodes.new(type="ShaderNodeEmission") + layer_weight = nodes.new(type="ShaderNodeLayerWeight") + color_ramp = nodes.new(type="ShaderNodeValToRGB") + + color_ramp.color_ramp.elements[0].color = (0, 0, 0, 1) + color_ramp.color_ramp.elements[1].color = (0, 1, 0, 1) + layer_weight.inputs["Blend"].default_value = 0.4 + + links.new(layer_weight.outputs["Fresnel"], color_ramp.inputs["Fac"]) + links.new(color_ramp.outputs["Color"], emission_node.inputs["Color"]) + + emission_node.inputs["Strength"].default_value = 5.0 + emission_node.inputs["Color"].default_value = (0.0, 1.0, 0.0, 1) + + links.new(emission_node.outputs["Emission"], output_node.inputs["Surface"]) + + return mat From f23c5c20168e5585341d39a27cf825b5117c6c4c Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sun, 28 Apr 2024 23:53:47 -0700 Subject: [PATCH 06/17] Add detailed example for Fooocus on Modal (#721) * Add Fooocus Modal example with detailed comments and Markdown sections * edits fooocus example --------- Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Charles Frye --- misc/run_fooocus.py | 99 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 misc/run_fooocus.py diff --git a/misc/run_fooocus.py b/misc/run_fooocus.py new file mode 100644 index 000000000..904f0d530 --- /dev/null +++ b/misc/run_fooocus.py @@ -0,0 +1,99 @@ +# # Generate: Fooocus +# +# This example demonstrates how to set up and run a web server using the Modal library with Fooocus as the frontend. +# Fooocus provides a beginner-friendly interface to work with the SDXL 1.0 model for image generation tasks. +# The script includes the setup of a Docker image, initialization of Fooocus, and launching a web server with GPU support. +# +# ## Basic setup + +import modal + +# To create an image that can run Fooocus, we start from an official NVIDIA base image and then add Python +# and a few system packages. +# +# We then download the Fooocus repository. 
+ +image = ( + modal.Image.from_registry( + "nvidia/cuda:12.3.1-base-ubuntu22.04", add_python="3.10" + ) + .apt_install( + "software-properties-common", + "git", + "git-lfs", + "coreutils", + "aria2", + "libgl1", + "libglib2.0-0", + "curl", + "wget", + "libsm6", + "libxrender1", + "libxext6", + "ffmpeg", + ) + .run_commands("git clone https://github.com/lllyasviel/Fooocus.git") +) + +# ## Initialize Fooocus +# +# We are not limited to running shell commands and package installers in the image setup. +# We can also run Python functions by defining them in our code and passing them to the `run_function` method. +# +# This function installs Fooocus's dependencies and downloads the SDXL 1.0 model to the container image. +# +# This all happens at the time the container image is defined, so that the image is ready to run Fooocus when it is deployed. + + +def init_Fooocus(): + import os + import subprocess + + # change the working directory to the Fooocus directory and install the required Python packages from the requirements file. + os.chdir("/Fooocus") + os.system("pip install -r requirements_versions.txt") + + # change the directory to the models' checkpoints and download the SDXL 1.0 model using wget. + os.chdir("./models/checkpoints") + subprocess.run( + "wget -O juggernautXL_v8Rundiffusion.safetensors 'https://huggingface.co/lllyasviel/fav_models/resolve/main/fav/juggernautXL_v8Rundiffusion.safetensors'", + shell=True, + ) + + +GPU_CONFIG = modal.gpu.T4() +image = image.run_function(init_Fooocus, gpu=GPU_CONFIG) + +# ## Run Fooocus +# +# The `run` function is decorated with `app.function` to define it as a Modal function. +# The `web_server` decorator indicates that this function will serve a web application on the specified port. +# We increase the startup timeout to three minutes to account for the time it takes to load the model and start the server. + +app = modal.App("Fooocus", image=image) + +PORT = 8000 +MINUTES = 60 + + +@app.function(gpu=GPU_CONFIG, timeout=10 * MINUTES) +@modal.web_server(port=PORT, startup_timeout=3 * MINUTES) +def run(): + import os + import subprocess + + # change the working directory to the Fooocus directory. 
+ os.chdir("/Fooocus") + + # launch the Fooocus application using a subprocess that listens on the specified port + subprocess.Popen( + [ + "python", + "launch.py", + "--listen", + "0.0.0.0", + "--port", + str(PORT), + "--always-high-vram", + ] + ) From f87d12deca8f8e14072dd91461ad17d87546ecd8 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Mon, 29 Apr 2024 14:21:10 -0700 Subject: [PATCH 07/17] remove unnecessary nesting of instructor (#725) will add local inference version as a peer when ready --- .../llm-structured/{instructor => }/instructor_generate.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename 06_gpu_and_ml/llm-structured/{instructor => }/instructor_generate.py (100%) diff --git a/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py b/06_gpu_and_ml/llm-structured/instructor_generate.py similarity index 100% rename from 06_gpu_and_ml/llm-structured/instructor/instructor_generate.py rename to 06_gpu_and_ml/llm-structured/instructor_generate.py From f3341012c93f69fdd4729809de948d8753d1d04d Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Mon, 29 Apr 2024 19:15:01 -0700 Subject: [PATCH 08/17] faster renders, numbers on throughput and latency (#726) --- 06_gpu_and_ml/blender/blender_video.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/06_gpu_and_ml/blender/blender_video.py b/06_gpu_and_ml/blender/blender_video.py index f5cc55622..ee1ed85a7 100644 --- a/06_gpu_and_ml/blender/blender_video.py +++ b/06_gpu_and_ml/blender/blender_video.py @@ -137,6 +137,7 @@ def configure_rendering(ctx, with_gpu: bool): ctx.scene.render.resolution_x = 1920 ctx.scene.render.resolution_y = 1080 ctx.scene.render.resolution_percentage = 100 + ctx.scene.cycles.samples = 128 # add GPU acceleration if available if with_gpu: @@ -220,7 +221,8 @@ def combine( # # The bytes for the video come back to our local machine, and we write them to a file. # -# The whole rendering process (for six seconds of 1080p 60 FPS video) takes between five and ten minutes on 10 T4 GPUs. +# The whole rendering process (for six seconds of 1080p 60 FPS video) takes about five minutes to run on 10 T4 GPUs, +# with a per-frame latency of under 10 seconds, and about two minutes to run on 100 CPUs, with a per-frame latency of about 30 seconds. @app.local_entrypoint() From 54c379561c03fde020d43014721bd7a857fe875d Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 2 May 2024 19:30:03 -0700 Subject: [PATCH 09/17] fixes newly-gated models in certain examples (#727) * remove extra line * adds instructions for handling gated model * handles gating for Mistral 7B in outlines example --- 06_gpu_and_ml/llm-serving/tgi_mixtral.py | 1 - 06_gpu_and_ml/llm-serving/vllm_inference.py | 9 +++++++++ 06_gpu_and_ml/llm-structured/outlines_generate.py | 14 ++++++++++++-- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/tgi_mixtral.py b/06_gpu_and_ml/llm-serving/tgi_mixtral.py index c4313043c..5ca7da284 100644 --- a/06_gpu_and_ml/llm-serving/tgi_mixtral.py +++ b/06_gpu_and_ml/llm-serving/tgi_mixtral.py @@ -48,7 +48,6 @@ # We can use the included utilities to download the model weights (and convert to safetensors, if necessary) # as part of the image build. # -# # For this step to work on a [gated model](https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/gated_model_access) # like Mixtral 8x7B, the `HF_TOKEN` environment variable must be set. 
# diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py index 9f6b23a6a..3f67aa908 100644 --- a/06_gpu_and_ml/llm-serving/vllm_inference.py +++ b/06_gpu_and_ml/llm-serving/vllm_inference.py @@ -36,6 +36,13 @@ # ### Download the weights # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. # +# For this step to work on a [gated model](https://huggingface.co/docs/hub/en/models-gated) +# like Mistral 7B, the `HF_TOKEN` environment variable must be set. +# +# After [creating a HuggingFace access token](https://huggingface.co/settings/tokens) +# and accepting the [terms of use](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1), +# head to the [secrets page](https://modal.com/secrets) to share it with Modal as `huggingface-secret`. +# # Tip: avoid using global variables in this function. # Changes to code outside this function will not be detected, and the download step will not re-run. def download_model_to_image(model_dir, model_name): @@ -48,6 +55,7 @@ def download_model_to_image(model_dir, model_name): model_name, local_dir=model_dir, ignore_patterns=["*.pt", "*.bin"], # Using safetensors + token=os.environ["HF_TOKEN"], ) move_cache() @@ -71,6 +79,7 @@ def download_model_to_image(model_dir, model_name): download_model_to_image, timeout=60 * 20, kwargs={"model_dir": MODEL_DIR, "model_name": MODEL_NAME}, + secrets=[modal.Secret.from_name("huggingface-secret")], ) ) diff --git a/06_gpu_and_ml/llm-structured/outlines_generate.py b/06_gpu_and_ml/llm-structured/outlines_generate.py index 19e7ae763..b54acadbf 100644 --- a/06_gpu_and_ml/llm-structured/outlines_generate.py +++ b/06_gpu_and_ml/llm-structured/outlines_generate.py @@ -24,7 +24,7 @@ # First, you'll want to build an image and install the relevant Python dependencies: # `outlines` and a Hugging Face inference stack. -from modal import App, Image, gpu +from modal import App, Image, Secret, gpu app = App( name="outlines-app" @@ -42,6 +42,13 @@ # Next, we download the Mistral-7B model from Hugging Face. # We do this as part of the definition of our Modal image so that # we don't need to download it every time our inference function is run. +# +# For this step to work on a [gated model](https://huggingface.co/docs/hub/en/models-gated) +# like Mistral 7B, the `HF_TOKEN` environment variable must be set. +# +# After [creating a HuggingFace access token](https://huggingface.co/settings/tokens) +# and accepting the [terms of use](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1), +# head to the [secrets page](https://modal.com/secrets) to share it with Modal as `huggingface-secret`. 
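+#
+# If you prefer the command line, you can also create the secret with the Modal CLI, e.g.
+# `modal secret create huggingface-secret HF_TOKEN=hf_...` (the token value here is a placeholder
+# for your own access token).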
def import_model(): @@ -50,7 +57,10 @@ def import_model(): outlines.models.transformers("mistralai/Mistral-7B-v0.1") -outlines_image = outlines_image.run_function(import_model) +outlines_image = outlines_image.run_function( + import_model, + secrets=[Secret.from_name("huggingface-secret")], +) # ## Define the schema From ad9346a7bf38272470ce20e1a3c6d4f578b2cd2c Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 2 May 2024 20:09:25 -0700 Subject: [PATCH 10/17] fixes relative path between instructor_generate and utils (#728) --- 06_gpu_and_ml/llm-structured/instructor_generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/06_gpu_and_ml/llm-structured/instructor_generate.py b/06_gpu_and_ml/llm-structured/instructor_generate.py index 242f419d3..57ba0ed0d 100644 --- a/06_gpu_and_ml/llm-structured/instructor_generate.py +++ b/06_gpu_and_ml/llm-structured/instructor_generate.py @@ -206,7 +206,7 @@ def get_examples(silent=True): We use importlib to avoid the need to define the repo as a package.""" import importlib - examples_root = Path(__file__).parent.parent.parent.parent + examples_root = Path(__file__).parent.parent.parent spec = importlib.util.spec_from_file_location( "utils", f"{examples_root}/internal/utils.py" ) From 8ff22cc373be59f9331f55c7aff799a41b7c0360 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 2 May 2024 22:34:50 -0700 Subject: [PATCH 11/17] centers logo, positions prisms, nicer material, match CPU + GPU throughput (#729) --- 06_gpu_and_ml/blender/blender_video.py | 63 +++++++++++++++++++++----- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/06_gpu_and_ml/blender/blender_video.py b/06_gpu_and_ml/blender/blender_video.py index ee1ed85a7..c07547691 100644 --- a/06_gpu_and_ml/blender/blender_video.py +++ b/06_gpu_and_ml/blender/blender_video.py @@ -6,9 +6,9 @@ # This example shows how you can render an animated 3D scene using # [Blender](https://www.blender.org/)'s Python interface. # -# You can run it on CPUs to scale out on one hundred of containers +# You can run it on CPUs to scale out on one hundred containers # or run it on GPUs to get higher throughput per node. -# Even with this simple scene, GPUs render 2x faster than CPUs. +# Even with this simple scene, GPUs render 10x faster than CPUs. 
# # The final render looks something like this: # @@ -60,7 +60,7 @@ @app.function( - gpu="T4" if WITH_GPU else None, + gpu="A10G" if WITH_GPU else None, concurrency_limit=10 if WITH_GPU else 100, # default limits on Modal free tier @@ -104,12 +104,33 @@ def render(angle: int = 0) -> bytes: # add the Modal logo: two neon green rectangular prisms iridescent_material = create_iridescent_material() - add_prism(ctx, (-1, 0, 0), 45, angle, iridescent_material) - add_prism(ctx, (3, 0, 0), -45, angle, iridescent_material) + add_prism(ctx, (-2.07, -1, 0), 45, angle, iridescent_material) + add_prism(ctx, (2.07, 1, 0), -45, angle, iridescent_material) - # set up the lighting and camera + # set up the lighting + # warm key light bpy.ops.object.light_add(type="POINT", location=(5, 5, 5)) - bpy.context.object.data.energy = 10 + key_light = bpy.context.object + key_light.data.energy = 100 + key_light.data.color = (1, 0.8, 0.5) # warm + + # tight, cool spotlight + bpy.ops.object.light_add(type="SPOT", radius=1, location=(4, 0, 6)) + spot_light = bpy.context.object + spot_light.data.energy = 500 + spot_light.data.spot_size = 0.5 + spot_light.data.color = (0.8, 0.8, 1) # cool + spot_light.rotation_euler = (3.14 / 4, 0, -3.14 / 4) + + # soft overall illumination + bpy.ops.object.light_add(type="AREA", radius=3, location=(-3, 3, 5)) + area_light = bpy.context.object + area_light.data.energy = 50 # softer + area_light.data.size = 5 # larger + area_light.data.color = (1, 1, 1) # neutral + area_light.rotation_euler = (3.14 / 2, 0, 3.14) + + # add camera bpy.ops.object.camera_add(location=(7, -7, 5)) scene.camera = bpy.context.object ctx.object.rotation_euler = (1.1, 0, 0.785) @@ -221,8 +242,8 @@ def combine( # # The bytes for the video come back to our local machine, and we write them to a file. # -# The whole rendering process (for six seconds of 1080p 60 FPS video) takes about five minutes to run on 10 T4 GPUs, -# with a per-frame latency of under 10 seconds, and about two minutes to run on 100 CPUs, with a per-frame latency of about 30 seconds. +# The whole rendering process (for six seconds of 1080p 60 FPS video) takes about five minutes to run on 10 A10G GPUs, +# with a per-frame latency of about 10 seconds, and about five minutes to run on 100 CPUs, with a per-frame latency of about one minute. 
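+#
+# As a rough consistency check on those numbers (all figures are approximate, and container startup
+# plus the final GIF-combination step are ignored here):
+#
+# ```python
+# frames = 360
+# gpu_containers, cpu_containers = 10, 100
+# gpu_seconds_per_frame, cpu_seconds_per_frame = 10, 60
+# print(frames / gpu_containers * gpu_seconds_per_frame / 60)  # ~6 minutes of render time on GPUs
+# print(frames / cpu_containers * cpu_seconds_per_frame / 60)  # ~3.6 minutes of render time on CPUs
+# ```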
@app.local_entrypoint() @@ -251,6 +272,11 @@ def add_prism(ctx, location, initial_rotation, angle, material): bpy.ops.mesh.primitive_cube_add(size=2, location=location) obj = ctx.object # the newly created object + bevel = obj.modifiers.new(name="Bevel", type="BEVEL") + bevel.width = 0.2 + bevel.segments = 5 + bevel.profile = 1.0 + # assign the material to the object obj.data.materials.append(material) @@ -278,13 +304,22 @@ def create_iridescent_material(): nodes.clear() - output_node = nodes.new(type="ShaderNodeOutputMaterial") + principled_node = nodes.new(type="ShaderNodeBsdfPrincipled") + emission_node = nodes.new(type="ShaderNodeEmission") layer_weight = nodes.new(type="ShaderNodeLayerWeight") color_ramp = nodes.new(type="ShaderNodeValToRGB") + mix_shader_node = nodes.new(type="ShaderNodeMixShader") + + output_node = nodes.new(type="ShaderNodeOutputMaterial") + + principled_node.inputs["Base Color"].default_value = (1, 1, 1, 1) + principled_node.inputs["Metallic"].default_value = 1.0 + principled_node.inputs["Roughness"].default_value = 0.5 + color_ramp.color_ramp.elements[0].color = (0, 0, 0, 1) - color_ramp.color_ramp.elements[1].color = (0, 1, 0, 1) + color_ramp.color_ramp.elements[1].color = (0, 0.5, 0, 1) layer_weight.inputs["Blend"].default_value = 0.4 links.new(layer_weight.outputs["Fresnel"], color_ramp.inputs["Fac"]) @@ -293,6 +328,10 @@ def create_iridescent_material(): emission_node.inputs["Strength"].default_value = 5.0 emission_node.inputs["Color"].default_value = (0.0, 1.0, 0.0, 1) - links.new(emission_node.outputs["Emission"], output_node.inputs["Surface"]) + links.new(emission_node.outputs["Emission"], mix_shader_node.inputs[1]) + links.new(principled_node.outputs["BSDF"], mix_shader_node.inputs[2]) + links.new(layer_weight.outputs["Fresnel"], mix_shader_node.inputs["Fac"]) + + links.new(mix_shader_node.outputs["Shader"], output_node.inputs["Surface"]) return mat From 03c44cb42a7440fc31ef00631f1a0cf0589161bb Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Fri, 3 May 2024 10:19:43 -0700 Subject: [PATCH 12/17] refactors lighting out of main render function (#730) --- 06_gpu_and_ml/blender/blender_video.py | 52 ++++++++++++++------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/06_gpu_and_ml/blender/blender_video.py b/06_gpu_and_ml/blender/blender_video.py index c07547691..c58c4cbf4 100644 --- a/06_gpu_and_ml/blender/blender_video.py +++ b/06_gpu_and_ml/blender/blender_video.py @@ -107,30 +107,8 @@ def render(angle: int = 0) -> bytes: add_prism(ctx, (-2.07, -1, 0), 45, angle, iridescent_material) add_prism(ctx, (2.07, 1, 0), -45, angle, iridescent_material) - # set up the lighting - # warm key light - bpy.ops.object.light_add(type="POINT", location=(5, 5, 5)) - key_light = bpy.context.object - key_light.data.energy = 100 - key_light.data.color = (1, 0.8, 0.5) # warm - - # tight, cool spotlight - bpy.ops.object.light_add(type="SPOT", radius=1, location=(4, 0, 6)) - spot_light = bpy.context.object - spot_light.data.energy = 500 - spot_light.data.spot_size = 0.5 - spot_light.data.color = (0.8, 0.8, 1) # cool - spot_light.rotation_euler = (3.14 / 4, 0, -3.14 / 4) - - # soft overall illumination - bpy.ops.object.light_add(type="AREA", radius=3, location=(-3, 3, 5)) - area_light = bpy.context.object - area_light.data.energy = 50 # softer - area_light.data.size = 5 # larger - area_light.data.color = (1, 1, 1) # neutral - area_light.rotation_euler = (3.14 / 2, 0, 3.14) - - # add camera + # add lighting and camera + add_lighting() 
bpy.ops.object.camera_add(location=(7, -7, 5)) scene.camera = bpy.context.object ctx.object.rotation_euler = (1.1, 0, 0.785) @@ -335,3 +313,29 @@ def create_iridescent_material(): links.new(mix_shader_node.outputs["Shader"], output_node.inputs["Surface"]) return mat + + +def add_lighting(): + import bpy + + # warm key light + bpy.ops.object.light_add(type="POINT", location=(5, 5, 5)) + key_light = bpy.context.object + key_light.data.energy = 100 + key_light.data.color = (1, 0.8, 0.5) # warm + + # tight, cool spotlight + bpy.ops.object.light_add(type="SPOT", radius=1, location=(4, 0, 6)) + spot_light = bpy.context.object + spot_light.data.energy = 500 + spot_light.data.spot_size = 0.5 + spot_light.data.color = (0.8, 0.8, 1) # cool + spot_light.rotation_euler = (3.14 / 4, 0, -3.14 / 4) + + # soft overall illumination + bpy.ops.object.light_add(type="AREA", radius=3, location=(-3, 3, 5)) + area_light = bpy.context.object + area_light.data.energy = 50 # softer + area_light.data.size = 5 # larger + area_light.data.color = (1, 1, 1) # neutral + area_light.rotation_euler = (3.14 / 2, 0, 3.14) From 5923bff5ab734633ae06b6ab4493838014794d06 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Fri, 3 May 2024 10:36:41 -0700 Subject: [PATCH 13/17] adds rate limit handler from slack SDK (#731) --- 10_integrations/webscraper.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/10_integrations/webscraper.py b/10_integrations/webscraper.py index e85135b08..d817b323e 100644 --- a/10_integrations/webscraper.py +++ b/10_integrations/webscraper.py @@ -39,7 +39,9 @@ async def get_links(url: str) -> set[str]: return set(links) -slack_sdk_image = modal.Image.debian_slim().pip_install("slack-sdk") +slack_sdk_image = modal.Image.debian_slim(python_version="3.10").pip_install( + "slack-sdk==3.27.1" +) @app.function( @@ -48,9 +50,13 @@ async def get_links(url: str) -> set[str]: ) def bot_token_msg(channel, message): import slack_sdk + from slack_sdk.http_retry.builtin_handlers import RateLimitErrorRetryHandler - print(f"Posting {message} to #{channel}") client = slack_sdk.WebClient(token=os.environ["SLACK_BOT_TOKEN"]) + rate_limit_handler = RateLimitErrorRetryHandler(max_retry_count=3) + client.retry_handlers.append(rate_limit_handler) + + print(f"Posting {message} to #{channel}") client.chat_postMessage(channel=channel, text=message) From e0b46deb9889d25832fb392307e9fdccb52d3528 Mon Sep 17 00:00:00 2001 From: Talha SARI Date: Sun, 5 May 2024 04:00:56 +0300 Subject: [PATCH 14/17] Fix whisper streaming (#733) * change endpoint name to transcribe to match example usage * add remote method to modal function usage * use aio to convert synch map into asynch * minor fix * change sleep to 0, fixed the curl giving error otherwise * correct old typo --------- Co-authored-by: Charles Frye --- 06_gpu_and_ml/openai_whisper/streaming/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/06_gpu_and_ml/openai_whisper/streaming/main.py b/06_gpu_and_ml/openai_whisper/streaming/main.py index cc8ae23b3..676d2b485 100644 --- a/06_gpu_and_ml/openai_whisper/streaming/main.py +++ b/06_gpu_and_ml/openai_whisper/streaming/main.py @@ -183,16 +183,16 @@ async def stream_whisper(audio_data: bytes): f.flush() segment_gen = split_silences(f.name) - for result in transcribe_segment.starmap( + async for result in transcribe_segment.starmap( segment_gen, kwargs=dict(audio_data=audio_data, model="base.en") ): - # Must cooperatively yeild here otherwise `StreamingResponse` will not iteratively 
return stream parts. - # see: https://github.com/python/asyncio/issues/284 - await asyncio.sleep(0.5) + # Must cooperatively yield here otherwise `StreamingResponse` will not iteratively return stream parts. + # see: https://github.com/python/asyncio/issues/284#issuecomment-154162668 + await asyncio.sleep(0) yield result["text"] -@web_app.get("/") +@web_app.get("/transcribe") async def transcribe(url: str): """ Usage: @@ -213,7 +213,7 @@ async def transcribe(url: str): print(f"downloading {url}") try: - audio_data = download_mp3_from_youtube(url) + audio_data = download_mp3_from_youtube.remote(url) except pytube.exceptions.RegexMatchError: raise HTTPException( status_code=422, detail=f"Could not process url {url}" From a238c9758583ccaeccdcbc217dddee75651cf26e Mon Sep 17 00:00:00 2001 From: bofeng huang Date: Sun, 5 May 2024 03:08:48 +0200 Subject: [PATCH 15/17] Fix vLLM template (#734) * Update vllm_mixtral.py * Fix template * Fix template --- 06_gpu_and_ml/llm-serving/vllm_gemma.py | 2 +- 06_gpu_and_ml/llm-serving/vllm_inference.py | 4 ++-- 06_gpu_and_ml/llm-serving/vllm_mixtral.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/vllm_gemma.py b/06_gpu_and_ml/llm-serving/vllm_gemma.py index 2a3545961..634c6d47a 100644 --- a/06_gpu_and_ml/llm-serving/vllm_gemma.py +++ b/06_gpu_and_ml/llm-serving/vllm_gemma.py @@ -121,7 +121,7 @@ class Model: @modal.enter() def load(self): self.template = ( - "start_of_turn>user\n{user}\nmodel" + "user\n{user}\nmodel\n" ) # Load the model. Tip: Some models, like MPT, may require `trust_remote_code=true`. diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py index 3f67aa908..c24e345db 100644 --- a/06_gpu_and_ml/llm-serving/vllm_inference.py +++ b/06_gpu_and_ml/llm-serving/vllm_inference.py @@ -109,11 +109,11 @@ class Model: def load_model(self): # Tip: models that are not fully implemented by Hugging Face may require `trust_remote_code=true`. self.llm = vllm.LLM(MODEL_DIR, tensor_parallel_size=GPU_CONFIG.count) - self.template = """[INST] <> + self.template = """[INST] <> {system} <> -{user} [/INST] """ +{user} [/INST]""" @modal.method() def generate(self, user_questions): diff --git a/06_gpu_and_ml/llm-serving/vllm_mixtral.py b/06_gpu_and_ml/llm-serving/vllm_mixtral.py index 57618ae28..eb236b9cb 100644 --- a/06_gpu_and_ml/llm-serving/vllm_mixtral.py +++ b/06_gpu_and_ml/llm-serving/vllm_mixtral.py @@ -121,7 +121,7 @@ def start_engine(self): disable_log_stats=True, # disable logging so we can stream tokens disable_log_requests=True, ) - self.template = " [INST] {user} [/INST] " + self.template = "[INST] {user} [/INST]" # this can take some time! 
self.engine = AsyncLLMEngine.from_engine_args(engine_args) From 2ac53ebc35b38e30d2288efb3cecaf41f19c8733 Mon Sep 17 00:00:00 2001 From: Akshat Bubna Date: Mon, 6 May 2024 00:10:06 -0400 Subject: [PATCH 16/17] install numpy explicitly in wikipedia example (#736) --- 06_gpu_and_ml/embeddings/wikipedia/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/06_gpu_and_ml/embeddings/wikipedia/main.py b/06_gpu_and_ml/embeddings/wikipedia/main.py index 95d898c22..0c3ffb5cc 100644 --- a/06_gpu_and_ml/embeddings/wikipedia/main.py +++ b/06_gpu_and_ml/embeddings/wikipedia/main.py @@ -78,7 +78,7 @@ def spawn_server() -> subprocess.Popen: add_python="3.10", ) .dockerfile_commands("ENTRYPOINT []") - .pip_install("httpx") + .pip_install("httpx", "numpy") ) with tei_image.imports(): From 75d6c997ecedb953e81b5b00cadf677eb96aed9f Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Mon, 6 May 2024 10:44:15 -0700 Subject: [PATCH 17/17] Run examples on change (#735) * inital draft of action to test monitoring * cleans up monitoring workflow * more complete draft of monitoring test action * removes draft monitoring workflow, reorganizes existing workflows * update internal development requirements * turn off dry run now that we're going back to prod * reorganize environment setup * WIP version of example execution * adds .secrets file from act * handles modal serve, proper system exit, drops extra script * updates actions, better environment setup * handle PRs with no changed files * add back dev dependencies for jupytext and pydantic in deploy * reverts changes to typechecking to avoid slowdown --- .github/actions/setup/action.yml | 36 ++++++++++++++ .github/workflows/cd.yml | 8 +--- .github/workflows/check.yml | 27 +++-------- .github/workflows/run-examples.yml | 76 ++++++++++++++++++++++++++++++ .gitignore | 3 ++ internal/requirements.txt | 7 ++- internal/run_example.py | 50 ++++++++++++++++++++ 7 files changed, 179 insertions(+), 28 deletions(-) create mode 100644 .github/actions/setup/action.yml create mode 100644 .github/workflows/run-examples.yml create mode 100644 internal/run_example.py diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml new file mode 100644 index 000000000..0312efef1 --- /dev/null +++ b/.github/actions/setup/action.yml @@ -0,0 +1,36 @@ +name: setup + +description: Set up a Python environment for the examples. 
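+
+# Illustrative usage from a workflow job in this repo (the input values shown here are just
+# examples; both inputs are optional and fall back to the defaults declared below):
+#
+#   steps:
+#     - uses: actions/checkout@v3
+#     - uses: ./.github/actions/setup
+#       with:
+#         version: "3.10"
+#         devDependencies: skip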
+ +inputs: + version: + description: Which Python version to install + required: false + default: "3.11" + devDependencies: + description: Whether to skip dependencies + required: false + default: "no-skip" + +runs: + using: composite + steps: + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.version }} + + - name: Install base packages + shell: bash + run: | + pip install uv + uv pip install --system setuptools wheel + + - name: Install development Python packages + if: ${{ inputs.devDependencies != 'skip' }} + shell: bash + run: uv pip install --system -r internal/requirements.txt + + - name: Install the modal client + shell: bash + run: uv pip install --system modal diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 50ab209a0..451c08f46 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -17,13 +17,9 @@ jobs: steps: - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 with: - python-version: "3.9" - - - name: Install Modal client package and jupytext - run: pip install modal-client jupytext pydantic~=1.10 + fetch-depth: 1 + - uses: ./.github/actions/setup - name: Run deployment script run: | diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 389875d8d..9f058e4c0 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -13,13 +13,9 @@ jobs: steps: - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 with: - python-version: "3.11" - - # keep version here in sync with .pre-commit-config.yaml and other modal repos - - run: pip install ruff==0.2.1 + fetch-depth: 1 + - uses: ./.github/actions/setup - run: ruff check @@ -31,16 +27,14 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 with: - python-version: "3.11" - - name: Install NbConvert - run: pip install jupyter nbconvert + fetch-depth: 1 + - uses: ./.github/actions/setup - name: Check notebooks are cleaned run: | jupyter nbconvert --clear-output --inplace 11_notebooks/*.ipynb - git diff --quiet && git diff --cached --quiet || exit 1 + git diff --quiet 11_notebooks/*.ipynb && git diff --cached --quiet 11_notebooks/*.ipynb || exit 1 pytest: name: Pytest @@ -48,16 +42,9 @@ jobs: steps: - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 with: - python-version: "3.11" - - - name: Install dev dependencies - run: pip install pytest jupytext pydantic~=1.10 - - - name: Install the Modal client - run: pip install modal-client + fetch-depth: 1 + - uses: ./.github/actions/setup - name: Run run: pytest -v . 
diff --git a/.github/workflows/run-examples.yml b/.github/workflows/run-examples.yml new file mode 100644 index 000000000..bf27d0adb --- /dev/null +++ b/.github/workflows/run-examples.yml @@ -0,0 +1,76 @@ +name: Run + +on: + pull_request: + branches: + - main + paths: + - "**.py" + push: + branches: + - main + paths: + - "**.py" + workflow_dispatch: + +# Cancel previous runs of the same PR but do not cancel previous runs on main +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +env: + TERM: linux + TERMINFO: /etc/terminfo + MODAL_TOKEN_ID: ${{ secrets.MODAL_MODAL_LABS_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_MODAL_LABS_TOKEN_SECRET }} + MODAL_ENVIRONMENT: main + +jobs: + # Output all changed files in a JSON format compatible with GitHub Actions job matrices + diff-matrix: + name: Generate matrix of changed examples + runs-on: ubuntu-20.04 + outputs: + matrix: ${{ steps.diff.outputs.all_changed_files }} + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Find changed examples + id: diff + uses: tj-actions/changed-files@v44 + with: + files: "**.py" + files_ignore: "internal/**,misc/**" + matrix: true + + - name: List all changed examples + run: echo '${{ steps.diff.outputs.all_changed_files }}' + + # Run each changed example, using the output of the previous step as a job matrix + run-changed: + name: Run changed example + needs: [diff-matrix] + if: + ${{ needs.diff-matrix.outputs.matrix != '[]' && + needs.diff-matrix.outputs.matrix != '' }} + runs-on: ubuntu-20.04 + strategy: + matrix: + file: ${{ fromJson(needs.diff-matrix.outputs.matrix) }} + fail-fast: false + + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + with: + fetch-depth: 1 + - uses: ./.github/actions/setup + + - name: Run example + run: | + echo "Running ${{ matrix.file }}" + stem=$(basename "${{ matrix.file }}" .py) + python3 -m internal.run_example $stem || exit $? diff --git a/.gitignore b/.gitignore index 53fe8b69e..3218fc050 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ venv .venv + +# secrets file for act, tool for local GitHub Actions testing +.secrets diff --git a/internal/requirements.txt b/internal/requirements.txt index 42bf85702..5c5120ec8 100644 --- a/internal/requirements.txt +++ b/internal/requirements.txt @@ -1,5 +1,8 @@ -modal pytest +jupyter +ipython +nbconvert jupytext~=1.16.1 pydantic~=1.10.14 -mypy==0.950 +mypy==1.2.0 +ruff==0.2.1 diff --git a/internal/run_example.py b/internal/run_example.py new file mode 100644 index 000000000..3b06a3cb0 --- /dev/null +++ b/internal/run_example.py @@ -0,0 +1,50 @@ +import os +import subprocess +import sys +import time + +from . 
import utils + +MINUTES = 60 +TIMEOUT = 12 * MINUTES + + +def run_script(example): + t0 = time.time() + + try: + print(f"cli args: {example.cli_args}") + process = subprocess.run( + example.cli_args, + env=os.environ | {"MODAL_SERVE_TIMEOUT": "5.0"}, + timeout=TIMEOUT, + ) + total_time = time.time() - t0 + if process.returncode == 0: + print(f"Success after {total_time:.2f}s :)") + else: + print( + f"Failed after {total_time:.2f}s with return code {process.returncode} :(" + ) + + returncode = process.returncode + + except subprocess.TimeoutExpired: + print(f"Past timeout of {TIMEOUT}s :(") + returncode = 999 + + return returncode + + +def run_single_example(stem): + examples = utils.get_examples() + for example in examples: + if stem == example.stem: + return run_script(example) + else: + print(f"Could not find example name {stem}") + return 0 + + +if __name__ == "__main__": + sys.exit(run_single_example(sys.argv[1]))
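+
+# Local usage sketch: run this as a module from the repository root (the relative import of
+# `utils` requires it), passing the example's filename stem, e.g. for
+# 06_gpu_and_ml/blender/blender_video.py:
+#
+#   python3 -m internal.run_example blender_video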