From 1ea38471ad13b02791de9bf8b95206d2a33c734a Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 25 Apr 2024 14:56:44 -0700 Subject: [PATCH 01/17] adds TRT-LLM example (#717) --- 06_gpu_and_ml/llm-serving/trtllm_llama.py | 617 ++++++++++++++++++++++ 1 file changed, 617 insertions(+) create mode 100644 06_gpu_and_ml/llm-serving/trtllm_llama.py diff --git a/06_gpu_and_ml/llm-serving/trtllm_llama.py b/06_gpu_and_ml/llm-serving/trtllm_llama.py new file mode 100644 index 000000000..08bd46569 --- /dev/null +++ b/06_gpu_and_ml/llm-serving/trtllm_llama.py @@ -0,0 +1,617 @@ +# # Serverless TensorRT-LLM (LLaMA 3 8B) +# +# In this example, we demonstrate how to use the TensorRT-LLM framework to serve Meta's LLaMA 3 8B model +# at a total throughput of roughly 4,500 output tokens per second on a single NVIDIA A100 40GB GPU. +# At [Modal's on-demand rate](https://modal.com/pricing) of ~$4/hr, that's under $0.20 per million tokens -- +# on auto-scaling infrastructure and served via a customizable API. +# +# Additional optimizations like speculative sampling and FP8 quantization can further improve throughput. +# For more on the throughput levels that are possible with TensorRT-LLM for different combinations +# of model, hardware, and workload, see the +# [official benchmarks](https://github.com/NVIDIA/TensorRT-LLM/blob/71d8d4d3dc655671f32535d6d2b60cab87f36e87/docs/source/performance.md). +# +# ## Overview +# +# This guide is intended to document two things: +# the general process for building TensorRT-LLM on Modal +# and a specific configuration for serving the LLaMA 3 8B model. +# +# ### Build process +# +# Any given TensorRT-LLM service requires a multi-stage build process, +# starting from model weights and ending with a compiled engine. +# Because that process touches many sharp-edged high-performance components +# across the stack, it can easily go wrong in subtle and hard-to-debug ways +# that are idiosyncratic to specific systems. +# And debugging GPU workloads is expensive! +# +# This example builds an entire service from scratch, from downloading weight tensors +# to responding to requests, and so serves as living, interactive documentation of a TensorRT-LLM +# build process that works on Modal. +# +# ### Engine configuration +# +# TensorRT-LLM is the Lamborghini of inference engines: it achieves seriously +# impressive performance, but only if you tune it carefully. +# We carefully document the choices we made here and point to additional resources +# so you know where and how you might adjust the parameters for your use case. +# +# ## Installing TensorRT-LLM +# +# To run TensorRT-LLM, we must first install it. Easier said than done! +# +# In Modal, we define [container images](https://modal.com/docs/guide/custom-containers) that run our serverless workloads. +# All Modal containers have access to GPU drivers via the underlying host environment, +# but we still need to install the software stack on top of the drivers, from the CUDA runtime up. +# +# We start from the official `nvidia/cuda:12.1.1-devel-ubuntu22.04` image, +# which includes the CUDA runtime & development libraries +# and the environment configuration necessary to run them. + +import modal + +tensorrt_image = modal.Image.from_registry( + "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10" +) + +# On top of that, we add some system dependencies of TensorRT-LLM, +# including OpenMPI for distributed communication, some core software like `git`, +# and the `tensorrt_llm` package itself. 
+ +tensorrt_image = tensorrt_image.apt_install( + "openmpi-bin", "libopenmpi-dev", "git", "git-lfs", "wget" +).pip_install( + "tensorrt_llm==0.10.0.dev2024042300", + pre=True, + extra_index_url="https://pypi.nvidia.com", +) + +# Note that we're doing this by [method-chaining](https://quanticdev.com/articles/method-chaining/) +# a number of calls to methods on the `modal.Image`. If you're familiar with +# Dockerfiles, you can think of this as a Pythonic interface to instructions like `RUN` and `CMD`. +# +# End-to-end, this step takes five minutes. +# If you're reading this from top to bottom, +# you might want to stop here and execute the example +# with `modal run trtllm_llama.py` +# so that it runs in the background while you read the rest. +# +# ## Downloading the Model +# +# Next, we download the model we want to serve. In this case, we're using the instruction-tuned +# version of Meta's Llama 3 8B model. +# We use the function below to download the model from the Hugging Face Hub. + +MODEL_DIR = "/root/model/model_input" +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +MODEL_REVISION = "7840f95a8c7a781d3f89c4818bf693431ab3119a" # pin model revisions to prevent unexpected changes! + + +def download_model(): + import os + + from huggingface_hub import snapshot_download + from transformers.utils import move_cache + + os.makedirs(MODEL_DIR, exist_ok=True) + snapshot_download( + MODEL_ID, + local_dir=MODEL_DIR, + ignore_patterns=["*.pt", "*.bin"], # using safetensors + revision=MODEL_REVISION, + ) + move_cache() + + +# Just defining that function doesn't actually download the model, though. +# We can run it by adding it to the image's build process with `run_function`. +# The download process has its own dependencies, which we add here. + +MINUTES = 60 # seconds +tensorrt_image = ( # update the image by downloading the model we're using + tensorrt_image.pip_install( # add utilities for downloading the model + "hf-transfer==0.1.6", + "huggingface_hub==0.22.2", + "requests~=2.31.0", + ) + .env( # hf-transfer: faster downloads, but fewer comforts + {"HF_HUB_ENABLE_HF_TRANSFER": "1"} + ) + .run_function( # download the model + download_model, + timeout=20 * MINUTES, + secrets=[modal.Secret.from_name("huggingface-secret")], + ) +) + +# ## Configuring the model +# +# Now that we have the model downloaded, we need to convert it to a format that TensorRT-LLM can use. +# We use a convenience script provided by the TensorRT-LLM team. +# This script takes a few minutes to run. + +GIT_HASH = "71d8d4d3dc655671f32535d6d2b60cab87f36e87" +CHECKPOINT_SCRIPT_URL = f"https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/{GIT_HASH}/examples/llama/convert_checkpoint.py" + +# TensorRT-LLM requires that a GPU be present to load the model, even though it isn't used directly during this conversion process. +# We'll use a single A100-40GB GPU for this example, but we have also tested it successfully with A10G, A100-80GB, and H100 GPUs. +# +# The most important feature to track when selecting hardware to run on is GPU RAM: +# larger models, longer sequences, and bigger batches all require more memory, +# We tuned all three to maximize throughput on this example. +# +# The amount of GPU RAM on a single card is a tight constraint for most LLMs: +# RAM is measured in tens of gigabytes and +# models have billions of floating point parameters, +# each consuming one to four bytes of memory. 
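+# As a rough back-of-the-envelope check (our arithmetic, not a measured figure):
+# the 8B parameters of this model, at two bytes each in 16-bit precision,
+# come to roughly 8e9 * 2 bytes = ~16 GB of weights,
+# which is why it fits on a single 40 GB A100 with room to spare
+# for activations and the key-value cache.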
+# The performance cliff if you need to spill to CPU memory is steep, +# so the only solution is to split the model across multiple GPUs. +# This is particularly important when serving larger models (e.g. 70B or 8x22B). + +N_GPUS = 1 # Heads up: this example has not yet been tested with multiple GPUs +GPU_CONFIG = modal.gpu.A100(count=N_GPUS) + +# This is also the point where we specify the data type for this model. +# We use IEEE 754-compliant half-precision floats, (`float16`), because we found that it resulted in marginally higher throughput, +# but the model is provided in Google's +# [`bfloat16` format](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format). +# On the latest Ada Lovelace chips, you might use `float8` to reduce GPU RAM usage and speed up inference, +# but note that the FP8 format is very new, so expect rough edges. + +DTYPE = "float16" + +# We put that all together with another invocation of `.run_commands`. + +CKPT_DIR = "/root/model/model_ckpt" +tensorrt_image = ( # update the image by converting the model to TensorRT format + tensorrt_image.run_commands( # takes ~5 minutes + [ + f"wget {CHECKPOINT_SCRIPT_URL} -O /root/convert_checkpoint.py", + f"python /root/convert_checkpoint.py --model_dir={MODEL_DIR} --output_dir={CKPT_DIR}" + + f" --tp_size={N_GPUS} --dtype={DTYPE}", + ], + gpu=GPU_CONFIG, # GPU must be present to load tensorrt_llm + ) +) + +# ## Compiling the engine +# +# TensorRT-LLM achieves its high throughput primarily by compiling the model: +# making concrete choices of CUDA kernels to execute for each operation. +# These kernels are much more specific than `matrix_multiply` or `softmax` -- +# they have names like `maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148t_nt`. +# They are optimized for the specific types and shapes of tensors that the model uses +# and for the specific hardware that the model runs on. +# +# That means we need to know all of that information a priori -- +# more like the original TensorFlow, which defined static graphs, than like PyTorch, +# which builds up a graph of kernels dynamically at runtime. +# +# This extra layer of constraint on our LLM service is precisely +# what allows TensorRT-LLM to achieve its high throughput. +# +# So we need to specify things like the maximum batch size and the lengths of inputs and outputs. +# The closer these are to the actual values we'll use in production, the better the throughput we'll get. + +MAX_INPUT_LEN, MAX_OUTPUT_LEN = 256, 256 +MAX_BATCH_SIZE = ( + 128 # better throughput at larger batch sizes, limited by GPU RAM +) +ENGINE_DIR = "/root/model/model_output" + +SIZE_ARGS = f"--max_batch_size={MAX_BATCH_SIZE} --max_input_len={MAX_INPUT_LEN} --max_output_len={MAX_OUTPUT_LEN}" + +# There are many additional options you can pass to `trtllm-build` to tune the engine for your specific workload. +# You can find the document we used for LLaMA +# [here](https://github.com/NVIDIA/TensorRT-LLM/tree/66ef1df492f7bc9c8eeb01d7e14db01838e3f0bd/examples/llama), +# which you can use to adjust the arguments to fit your workloads, +# e.g. adjusting rotary embeddings and block sizes for longer contexts. +# +# We selected plugins that accelerate two core components of the model: dense matrix multiplication and attention. +# You can read more about the plugin options [here](https://fetch.ai/blog/advancing-llm-optimization). + +PLUGIN_ARGS = f"--gemm_plugin={DTYPE} --gpt_attention_plugin={DTYPE}" + +# We put all of this together with another invocation of `.run_commands`. 
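+#
+# For reference, with the values defined above, the assembled `trtllm-build` command works out to roughly:
+#
+#   trtllm-build --checkpoint_dir /root/model/model_ckpt --output_dir /root/model/model_output \
+#       --tp_size=1 --workers=1 \
+#       --max_batch_size=128 --max_input_len=256 --max_output_len=256 \
+#       --gemm_plugin=float16 --gpt_attention_plugin=float16
+#
+# (our expansion of the f-strings for illustration, not an additional command to run).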
+ +tensorrt_image = ( # update the image by building the TensorRT engine + tensorrt_image.run_commands( # takes ~5 minutes + [ + f"trtllm-build --checkpoint_dir {CKPT_DIR} --output_dir {ENGINE_DIR}" + + f" --tp_size={N_GPUS} --workers={N_GPUS}" + + f" {SIZE_ARGS}" + + f" {PLUGIN_ARGS}" + ], + gpu=GPU_CONFIG, # TRT-LLM compilation is GPU-specific, so make sure this matches production! + ).env( # show more log information from the inference engine + {"TLLM_LOG_LEVEL": "INFO"} + ) +) + +# ## Serving inference at thousands of tokens per second +# +# Now that we have the engine compiled, we can serve it with Modal by creating an `App`. + +app = modal.App(f"example-trtllm-{MODEL_ID}", image=tensorrt_image) + +# Thanks to our custom container runtime system, even this +# large, many gigabyte container boots in seconds. +# +# At container start time, we boot up the engine, which completes in under 30 seconds. +# Container starts are triggered when Modal scales up your infrastructure, +# like the first time you run this code or the first time a request comes in after a period of inactivity. +# +# Container lifecycles in Modal are managed via our `Cls` interface, so we define one below +# to manage the engine and run inference. +# For details, see [this guide](https://modal.com/docs/guide/lifecycle-functions). + + +@app.cls( + gpu=GPU_CONFIG, + secrets=[modal.Secret.from_name("huggingface-secret")], + container_idle_timeout=10 * MINUTES, +) +class Model: + @modal.enter() + def load(self): + """Loads the TRT-LLM engine and configures our tokenizer. + + The @enter decorator ensures that it runs only once per container, when it starts.""" + import time + + print( + f"{COLOR['HEADER']}🥶 Cold boot: spinning up TRT-LLM engine{COLOR['ENDC']}" + ) + self.init_start = time.monotonic_ns() + + import tensorrt_llm + from tensorrt_llm.runtime import ModelRunner + from transformers import AutoTokenizer + + self.tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + # LLaMA models do not have a padding token, so we use the EOS token + self.tokenizer.add_special_tokens( + {"pad_token": self.tokenizer.eos_token} + ) + # and then we add it from the left, to minimize impact on the output + self.tokenizer.padding_side = "left" + self.pad_id = self.tokenizer.pad_token_id + self.end_id = self.tokenizer.eos_token_id + + runner_kwargs = dict( + engine_dir=f"{ENGINE_DIR}", + lora_dir=None, + rank=tensorrt_llm.mpi_rank(), # this will need to be adjusted to use multiple GPUs + ) + + self.model = ModelRunner.from_dir(**runner_kwargs) + + self.init_duration_s = (time.monotonic_ns() - self.init_start) / 1e9 + print( + f"{COLOR['HEADER']}🚀 Cold boot finished in {self.init_duration_s}s{COLOR['ENDC']}" + ) + + @modal.method() + def generate(self, prompts: list[str], settings=None): + """Generate responses to a batch of prompts, optionally with custom inference settings.""" + import time + + if settings is None: + settings = dict( + temperature=0.1, # temperature 0 not allowed, so we set top_k to 1 to get the same effect + top_k=1, + stop_words_list=None, + repetition_penalty=1.1, + ) + + settings[ + "max_new_tokens" + ] = MAX_OUTPUT_LEN # exceeding this will raise an error + settings["end_id"] = self.end_id + settings["pad_id"] = self.pad_id + + num_prompts = len(prompts) + + if num_prompts > MAX_BATCH_SIZE: + raise ValueError( + f"Batch size {num_prompts} exceeds maximum of {MAX_BATCH_SIZE}" + ) + + print( + f"{COLOR['HEADER']}🚀 Generating completions for batch of size {num_prompts}...{COLOR['ENDC']}" + ) + start = 
time.monotonic_ns() + + parsed_prompts = [ + self.tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + for prompt in prompts + ] + + print( + f"{COLOR['HEADER']}Parsed prompts:{COLOR['ENDC']}", + *parsed_prompts, + sep="\n\t", + ) + + inputs_t = self.tokenizer( + parsed_prompts, return_tensors="pt", padding=True, truncation=False + )["input_ids"] + + print( + f"{COLOR['HEADER']}Input tensors:{COLOR['ENDC']}", inputs_t[:, :8] + ) + + outputs_t = self.model.generate(inputs_t, **settings) + + outputs_text = self.tokenizer.batch_decode( + outputs_t[:, 0] + ) # only one output per input, so we index with 0 + + responses = [ + extract_assistant_response(output_text) + for output_text in outputs_text + ] + duration_s = (time.monotonic_ns() - start) / 1e9 + + num_tokens = sum( + map(lambda r: len(self.tokenizer.encode(r)), responses) + ) + + for prompt, response in zip(prompts, responses): + print( + f"{COLOR['HEADER']}{COLOR['GREEN']}{prompt}", + f"\n{COLOR['BLUE']}{response}", + "\n\n", + sep=COLOR["ENDC"], + ) + time.sleep(0.01) # to avoid log truncation + + print( + f"{COLOR['HEADER']}{COLOR['GREEN']}Generated {num_tokens} tokens from {MODEL_ID} in {duration_s:.1f} seconds," + f" throughput = {num_tokens / duration_s:.0f} tokens/second for batch of size {num_prompts} on {GPU_CONFIG}.{COLOR['ENDC']}" + ) + + return responses + + +# ## Calling our inference function +# +# Now, how do we actually run the model? +# +# There are two basic methods: from Python via our SDK or from anywhere, by setting up an API. +# +# ### Calling inference from Python +# +# To run our `Model`'s `.generate` method from Python, we just need to call it -- +# with `.remote` appended to run it on Modal. +# +# We wrap that logic in a `local_entrypoint` so you can run it from the command line with +# ```bash +# modal run trtllm_llama.py +# ``` +# +# For simplicity, we hard-code a batch of 128 questions to ask the model. 
+ + +@app.local_entrypoint() +def main(): + questions = [ + # Generic assistant questions + "What are you?", + "What can you do?", + # Coding + "Implement a Python function to compute the Fibonacci numbers.", + "Write a Rust function that performs binary exponentiation.", + "How do I allocate memory in C?", + "What are the differences between Javascript and Python?", + "How do I find invalid indices in Postgres?", + "How can you implement a LRU (Least Recently Used) cache in Python?", + "What approach would you use to detect and prevent race conditions in a multithreaded application?", + "Can you explain how a decision tree algorithm works in machine learning?", + "How would you design a simple key-value store database from scratch?", + "How do you handle deadlock situations in concurrent programming?", + "What is the logic behind the A* search algorithm, and where is it used?", + "How can you design an efficient autocomplete system?", + "What approach would you take to design a secure session management system in a web application?", + "How would you handle collision in a hash table?", + "How can you implement a load balancer for a distributed system?", + "Implement a Python class for a doubly linked list.", + "Write a Haskell function that generates prime numbers using the Sieve of Eratosthenes.", + "Develop a simple HTTP server in Rust.", + # Literate and creative writing + "What is the fable involving a fox and grapes?", + "Who does Harry turn into a balloon?", + "Write a story in the style of James Joyce about a trip to the Australian outback in 2083 to see robots in the beautiful desert.", + "Write a tale about a time-traveling historian who's determined to witness the most significant events in human history.", + "Describe a day in the life of a secret agent who's also a full-time parent.", + "Create a story about a detective who can communicate with animals.", + "What is the most unusual thing about living in a city floating in the clouds?", + "In a world where dreams are shared, what happens when a nightmare invades a peaceful dream?", + "Describe the adventure of a lifetime for a group of friends who found a map leading to a parallel universe.", + "Tell a story about a musician who discovers that their music has magical powers.", + "In a world where people age backwards, describe the life of a 5-year-old man.", + "Create a tale about a painter whose artwork comes to life every night.", + "What happens when a poet's verses start to predict future events?", + "Imagine a world where books can talk. 
How does a librarian handle them?", + "Tell a story about an astronaut who discovered a planet populated by plants.", + "Describe the journey of a letter traveling through the most sophisticated postal service ever.", + "Write a tale about a chef whose food can evoke memories from the eater's past.", + "Write a poem in the style of Walt Whitman about the modern digital world.", + "Create a short story about a society where people can only speak in metaphors.", + "What are the main themes in Dostoevsky's 'Crime and Punishment'?", + # History and Philosophy + "What were the major contributing factors to the fall of the Roman Empire?", + "How did the invention of the printing press revolutionize European society?", + "What are the effects of quantitative easing?", + "How did the Greek philosophers influence economic thought in the ancient world?", + "What were the economic and philosophical factors that led to the fall of the Soviet Union?", + "How did decolonization in the 20th century change the geopolitical map?", + "What was the influence of the Khmer Empire on Southeast Asia's history and culture?", + "What led to the rise and fall of the Mongol Empire?", + "Discuss the effects of the Industrial Revolution on urban development in 19th century Europe.", + "How did the Treaty of Versailles contribute to the outbreak of World War II?", + "What led to the rise and fall of the Mongol Empire?", + "Discuss the effects of the Industrial Revolution on urban development in 19th century Europe.", + "How did the Treaty of Versailles contribute to the outbreak of World War II?", + "Explain the concept of 'tabula rasa' in John Locke's philosophy.", + "What does Nietzsche mean by 'ressentiment'?", + "Compare and contrast the early and late works of Ludwig Wittgenstein. Which do you prefer?", + "How does the trolley problem explore the ethics of decision-making in critical situations?", + # Thoughtfulness + "Describe the city of the future, considering advances in technology, environmental changes, and societal shifts.", + "In a dystopian future where water is the most valuable commodity, how would society function?", + "If a scientist discovers immortality, how could this impact society, economy, and the environment?", + "What could be the potential implications of contact with an advanced alien civilization?", + "Describe how you would mediate a conflict between two roommates about doing the dishes using techniques of non-violent communication.", + "If you could design a school curriculum for the future, what subjects would you include to prepare students for the next 50 years?", + "How would society change if teleportation was invented and widely accessible?", + "Consider a future where artificial intelligence governs countries. What are the potential benefits and pitfalls?", + # Math + "What is the product of 9 and 8?", + "If a train travels 120 kilometers in 2 hours, what is its average speed?", + "Think through this step by step. If the sequence a_n is defined by a_1 = 3, a_2 = 5, and a_n = a_(n-1) + a_(n-2) for n > 2, find a_6.", + "Think through this step by step. Calculate the sum of an arithmetic series with first term 3, last term 35, and total terms 11.", + "Think through this step by step. What is the area of a triangle with vertices at the points (1,2), (3,-4), and (-2,5)?", + "Think through this step by step. 
Solve the following system of linear equations: 3x + 2y = 14, 5x - y = 15.", + # Facts + "Who was Emperor Norton I, and what was his significance in San Francisco's history?", + "What is the Voynich manuscript, and why has it perplexed scholars for centuries?", + "What was Project A119 and what were its objectives?", + "What is the 'Dyatlov Pass incident' and why does it remain a mystery?", + "What is the 'Emu War' that took place in Australia in the 1930s?", + "What is the 'Phantom Time Hypothesis' proposed by Heribert Illig?", + "Who was the 'Green Children of Woolpit' as per 12th-century English legend?", + "What are 'zombie stars' in the context of astronomy?", + "Who were the 'Dog-Headed Saint' and the 'Lion-Faced Saint' in medieval Christian traditions?", + "What is the story of the 'Globsters', unidentified organic masses washed up on the shores?", + "Which countries in the European Union use currencies other than the Euro, and what are those currencies?", + # Multilingual + "战国时期最重要的人物是谁?", + "Tuende hatua kwa hatua. Hesabu jumla ya mfululizo wa kihesabu wenye neno la kwanza 2, neno la mwisho 42, na jumla ya maneno 21.", + "Kannst du die wichtigsten Eigenschaften und Funktionen des NMDA-Rezeptors beschreiben?", + "¿Cuáles son los principales impactos ambientales de la deforestación en la Amazonía?", + "Décris la structure et le rôle de la mitochondrie dans une cellule.", + "Какие были социальные последствия Перестройки в Советском Союзе?", + # Economics and Business + "What are the principles of behavioral economics and how do they influence consumer choices?", + "Discuss the impact of blockchain technology on traditional banking systems.", + "What are the long-term effects of trade wars on global economic stability?", + "What is the law of supply and demand?", + "Explain the concept of inflation and its typical causes.", + "What is a trade deficit, and why does it matter?", + "How do interest rates affect consumer spending and saving?", + "What is GDP and why is it important for measuring economic health?", + "What is the difference between revenue and profit?", + "Describe the role of a business plan in startup success.", + "How does market segmentation benefit a company?", + "Explain the concept of brand equity.", + "What are the advantages of franchising a business?", + "What are Michael Porter's five forces and how do they impact strategy for tech startups?", + # Science and Technology + "Discuss the potential impacts of quantum computing on data security.", + "How could CRISPR technology change the future of medical treatments?", + "Explain the significance of graphene in the development of future electronics.", + "How do renewable energy sources compare to fossil fuels in terms of environmental impact?", + "What are the most promising technologies for carbon capture and storage?", + "Explain why the sky is blue.", + "What is the principle behind the operation of a microwave oven?", + "How does Newton's third law apply to rocket propulsion?", + "What causes iron to rust?", + "Describe the process of photosynthesis in simple terms.", + "What is the role of a catalyst in a chemical reaction?", + "What is the basic structure of a DNA molecule?", + "How do vaccines work to protect the body from disease?", + "Explain the significance of mitosis in cellular reproduction.", + "What are tectonic plates and how do they affect earthquakes?", + "How does the greenhouse effect contribute to global warming?", + "Describe the water cycle and its importance to Earth's climate.", + "What 
causes the phases of the Moon?", + "How do black holes form?", + "Explain the significance of the Big Bang theory.", + "What is the function of the CPU in a computer system?", + "Explain the difference between RAM and ROM.", + "How does a solid-state drive (SSD) differ from a hard disk drive (HDD)?", + "What role does the motherboard play in a computer system?", + "Describe the purpose and function of a GPU.", + "What is TensorRT? What role does it play in neural network inference?", + ] + + model = Model() + model.generate.remote(questions) + # if you're calling this service from another Python project, + # use [`Model.lookup`](https://modal.com/docs/reference/modal.Cls#lookup) + + +# ### Calling inference via an API +# +# We can use `modal.web_endpoint` and `app.function` to turn any Python function into a web API. +# +# This API wrapper doesn't need all the dependencies of the core inference service, +# so we switch images here to a basic Linux image, `debian_slim`, which has everything we need. + +web_image = modal.Image.debian_slim(python_version="3.10") + +# From there, we can take the same remote generation logic we used in `main` +# and serve it with only a few more lines of code. + + +@app.function(image=web_image) +@modal.web_endpoint(method="POST") +def generate_web(data: dict): + return Model.generate.remote(data["prompts"], settings=None) + + +# To set our function up as a web endpoint, we need to run this file -- +# with `modal serve` to create a hot-reloading development server or `modal deploy` to deploy it to production. +# +# ```bash +# modal serve trtllm_llama.py +# ``` +# +# You can test the endpoint by sending a POST request with `curl` from another terminal: +# +# ```bash +# curl -X POST url-from-output-of-modal-serve-here \ +# -H "Content-Type: application/json" \ +# -d '{ +# "prompts": ["Tell me a joke", "Describe a dream you had recently", "Share your favorite childhood memory"] +# }' | python -m json.tool # python for pretty-printing, optional +# ``` +# +# And now you have a high-throughput, low-latency, autoscaling API for serving LLaMA 3 8B completions! +# +# ## Footer +# +# The rest of the code in this example is utility code. + + +COLOR = { + "HEADER": "\033[95m", + "BLUE": "\033[94m", + "GREEN": "\033[92m", + "RED": "\033[91m", + "ENDC": "\033[0m", +} + + +def extract_assistant_response(output_text): + """Model-specific code to extract model responses. 
+ + See this doc for LLaMA 3: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/.""" + # Split the output text by the assistant header token + parts = output_text.split("<|start_header_id|>assistant<|end_header_id|>") + + if len(parts) > 1: + # Join the parts after the first occurrence of the assistant header token + response = parts[1].split("<|eot_id|>")[0].strip() + + # Remove any remaining special tokens and whitespace + response = response.replace("<|eot_id|>", "").strip() + + return response + else: + return output_text From 45f75b7377139d168b627e147669965a49749d87 Mon Sep 17 00:00:00 2001 From: Minki Jung <113061064+jung0072@users.noreply.github.com> Date: Thu, 25 Apr 2024 17:57:27 -0400 Subject: [PATCH 02/17] add python version to the image to prevent pip install error (#715) --- 06_gpu_and_ml/llm-serving/vllm_gemma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/06_gpu_and_ml/llm-serving/vllm_gemma.py b/06_gpu_and_ml/llm-serving/vllm_gemma.py index 9cefef401..2a3545961 100644 --- a/06_gpu_and_ml/llm-serving/vllm_gemma.py +++ b/06_gpu_and_ml/llm-serving/vllm_gemma.py @@ -76,7 +76,7 @@ def download_model_to_image(model_dir, model_name): # and save the resulting files to the container image -- that way we don't need # to redownload the weights every time we change the server's code or start up more instances of the server. image = ( - modal.Image.debian_slim() + modal.Image.debian_slim(python_version="3.10") .pip_install( "vllm==0.4.0.post1", "torch==2.1.2", From 0e6f4d817136e9746fd017b587ba14744673b9b2 Mon Sep 17 00:00:00 2001 From: Alonso Astroza Tagle Date: Sun, 28 Apr 2024 23:50:46 -0400 Subject: [PATCH 03/17] python_version in Image declaration (#720) --- 06_gpu_and_ml/llm-serving/vllm_inference.py | 2 +- 06_gpu_and_ml/llm-serving/vllm_mixtral.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py index c3d0375c4..9f6b23a6a 100644 --- a/06_gpu_and_ml/llm-serving/vllm_inference.py +++ b/06_gpu_and_ml/llm-serving/vllm_inference.py @@ -56,7 +56,7 @@ def download_model_to_image(model_dir, model_name): # We’ll start from Modal's Debian slim image. # Then we’ll use `run_function` with `download_model_to_image` to write the model into the container image. image = ( - modal.Image.debian_slim() + modal.Image.debian_slim(python_version="3.10") .pip_install( "vllm==0.4.0.post1", "torch==2.1.2", diff --git a/06_gpu_and_ml/llm-serving/vllm_mixtral.py b/06_gpu_and_ml/llm-serving/vllm_mixtral.py index 9bb9471ba..57618ae28 100644 --- a/06_gpu_and_ml/llm-serving/vllm_mixtral.py +++ b/06_gpu_and_ml/llm-serving/vllm_mixtral.py @@ -63,7 +63,7 @@ def download_model_to_image(model_dir, model_name, model_revision): # the model are saved within the container image. 
vllm_image = ( - modal.Image.debian_slim() + modal.Image.debian_slim(python_version="3.10") .pip_install( "vllm==0.4.0.post1", "torch==2.1.2", From dabdf155706bf8f56b3a9b4ebc15afedd95030a3 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Sun, 28 Apr 2024 22:45:00 -0700 Subject: [PATCH 04/17] Clean up instructor example (#722) * removes extra inference file * cleans up instructor example --- .../llm-structured/instructor/inference.py | 79 --------- .../instructor/instructor_generate.py | 161 ++++++++++-------- 2 files changed, 90 insertions(+), 150 deletions(-) delete mode 100644 06_gpu_and_ml/llm-structured/instructor/inference.py diff --git a/06_gpu_and_ml/llm-structured/instructor/inference.py b/06_gpu_and_ml/llm-structured/instructor/inference.py deleted file mode 100644 index d8e765764..000000000 --- a/06_gpu_and_ml/llm-structured/instructor/inference.py +++ /dev/null @@ -1,79 +0,0 @@ -# # Fast inference with vLLM (Mistral 7B) -# -# In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm) -# to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching. - -import os -import subprocess - -from modal import App, Image, Secret, gpu, web_server - -MODEL_DIR = "/model" -BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1" - - -# ## Define a container image - - -# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this -# is that the container no longer has to re-download the model from Huggingface - instead, it will take -# advantage of Modal's internal filesystem for faster cold starts. -# -# ### Download the weights -# We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. -# -# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. -def download_model_to_folder(): - from huggingface_hub import snapshot_download - from transformers.utils import move_cache - - os.makedirs(MODEL_DIR, exist_ok=True) - - snapshot_download( - BASE_MODEL, - local_dir=MODEL_DIR, - ignore_patterns=["*.pt", "*.bin"], # Using safetensors - ) - move_cache() - - -# ### Image definition -# We'll start from a recommended Docker Hub image and install `vLLM`. -# Then we'll use `run_function` to run the function defined above to ensure the weights of -# the model are saved within the container image. -image = ( - Image.from_registry( - "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10" - ) - .pip_install( - "vllm==0.2.5", - "huggingface_hub==0.19.4", - "hf-transfer==0.1.4", - "torch==2.1.2", - ) - # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. 
- .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) - .run_function( - download_model_to_folder, - secrets=[Secret.from_name("huggingface")], - timeout=60 * 20, - ) -) - -app = App( - "vllm-inference-openai-compatible", image=image -) # Note: prior to April 2024, "app" was called "stub" - - -GPU_CONFIG = gpu.A100(count=1) # 40GB A100 by default - - -@app.function( - allow_concurrent_inputs=100, - gpu=GPU_CONFIG, -) -@web_server(8000, startup_timeout=90) -def openai_compatible_server(): - target = BASE_MODEL - cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --host 0.0.0.0 --port 8000" - subprocess.Popen(cmd, shell=True) diff --git a/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py b/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py index ca6d66fec..242f419d3 100644 --- a/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py +++ b/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py @@ -3,47 +3,46 @@ # --- # # Structured Data Extraction using `instructor` # -# This example demonstrates how to use the `instructor` library to extract structured data from unstructured text. +# This example demonstrates how to use the `instructor` library to extract structured, schematized data from unstructured text. # -# Structured output is a powerful but under-appreciated feature of LLMs, -# because it makes it easier to connect LLMs to other software, -# for example enabling the ingestion of unstructured data into structured databases. +# Structured output is a powerful but under-appreciated feature of LLMs. +# Structured output allows LLMs and multimodal models to connect to traditional software, +# for example enabling the ingestion of unstructured data like text files into structured databases. +# Applied properly, it makes them an extreme example of the [Robustness Principle](https://en.wikipedia.org/wiki/Robustness_principle) +# Jon Postel formulated for TCP: "Be conservative in what you send, be liberal in what you accept". # -# The unstructured data in this example is the code from the examples in the Modal examples repository -- -# including this one! -# -# We use this exact code to monitor the coverage of the examples -# and to make decisions about which examples to write next! +# The unstructured data used in this example code is the code from the examples in the Modal examples repository -- +# including this example's code! # # The output includes a JSONL file containing, on each line, the metadata extracted from the code in one example. # This can be consumed downstream by other software systems, like a database or a dashboard. # -# We include in this folder a Jupyter notebook with some basic analyses. -# # ## Environment setup # -# We setup the environment our code will run in first. +# We set up the environment our code will run in first. # In Modal, we define environments via [container images](https://modal.com/docs/guide/custom-container), # much like Docker images, by iteratively chaining together commands. # -# This example also uses models from Anthropic, so if you want to run it yourself, -# you'll need to set up a Modal [`Secret`](https://modal.com/docs/guide/secrets) -# called `my-anthropic-secret` for your OpenAI API key. +# Here there's just one command, installing instructor and the Python SDK for Anthropic's LLM API. 
from pathlib import Path from typing import Literal, Optional import modal +from pydantic import BaseModel, Field image = modal.Image.debian_slim(python_version="3.11").pip_install( - "instructor~=1.0.0", "anthropic~=0.23.1", "matplotlib~=3.8.3" + "instructor~=1.0.0", "anthropic~=0.23.1" ) +# This example uses models from Anthropic, so if you want to run it yourself, +# you'll need to set up a Modal [`Secret`](https://modal.com/docs/guide/secrets) +# called `my-anthropic-secret` for your OpenAI API key. + app = modal.App( image=image, secrets=[modal.Secret.from_name("my-anthropic-secret")] ) # Note: prior to April 2024, "app" was called "stub" - -# ## The overall flow +# ## Running Modal functions from the command line # # We'll run the example by calling `modal run instructor_generate.py` from the command line. # @@ -64,7 +63,7 @@ @app.local_entrypoint() -def main(limit: int = 15, with_opus: bool = False): +def main(limit: int = 1, with_opus: bool = False): # find all of the examples in the repo examples = get_examples() # optionally limit the number of examples we process @@ -72,17 +71,17 @@ def main(limit: int = 15, with_opus: bool = False): examples = [None] # just run on this example else: examples = examples[:limit] - if examples: - # use Modal to map our extraction function over the examples concurrently - results = extract_example_metadata.map( - [ - f"{example.stem}\n" + Path(example.filename).read_text() - if example - else None - for example in examples - ], - kwargs={"with_opus": with_opus}, - ) + # use Modal to map our extraction function over the examples concurrently + results = extract_example_metadata.map( + ( # iterable of file contents + Path(example.filename).read_text() if example else None + for example in examples + ), + ( # iterable of filenames + example.stem if example else None for example in examples + ), + kwargs={"with_opus": with_opus}, + ) # save the results to a local file results_path = Path("/tmp") / "instructor_generate" / "results.jsonl" @@ -97,15 +96,65 @@ def main(limit: int = 15, with_opus: bool = False): f.write(result + "\n") -# ## Extracting JSON from unstructured text with `instructor` +# ## Extracting JSON from unstructured text with `instructor` and Pydantic # -# The real meat of this example is here, in the `extract_example_metadata` function. +# The real meat of this example is in this section, in the `extract_example_metadata` function and its schemas. # -# TODO: write this up -# TODO: refactor classes out of this function, explain separately +# We define a schema for the data we want the LLM to extract, using Pydantic. +# Instructor ensures that the LLM's output matches this schema. +# +# We can use the type system provided by Python and Pydantic to express many useful features +# of the data we want to extract -- ranging from wide-open fields like a `str`ing-valued `summary` +# to constrained fields like `difficulty`, which can only take on value between 1 and 5. + +class ExampleMetadataExtraction(BaseModel): + """Extracted metadata about an example from the Modal examples repo.""" -@app.function(concurrency_limit=5) # watch those rate limits! 
+ summary: str = Field(..., description="A brief summary of the example.") + has_thorough_explanation: bool = Field( + ..., + description="The example contains, in the form of inline comments with markdown formatting, a thorough explanation of what the code does.", + ) + domains: list[ + Literal[ + "artificial_intelligence", + "machine_learning", + "data_science", + "web_serving", + "parallel_computing", + ] + ] = Field(..., description="The") + difficulty: Literal[1, 2, 3, 4, 5] = Field( + ..., + description="The difficulty of the example, from 1 to 5. An example that uses only one or two basic Modal features and is understandable by a professional Python developer familiar with the basics of the relevant domains is a 1, while an example that uses many Modal features and uses advanced Python features like async generator coroutines or metaclasses is a 5.", + ) + freshness: float = Field( + ..., + description="The freshness of the example, from 0 to 1. This is relative to your knowledge cutoff. Examples are less fresh if they use older libraries and tools.", + ) + + +# That schema describes the data to be extracted by the LLM, but not all data is best extracted by an LLM. +# For example, the filename is easily determined in software. +# +# So we inject that information into the output after the LLM has done its work. That necessitates +# an additional schema, which inherits from the first. + + +class ExampleMetadata(ExampleMetadataExtraction): + """Metadata about an example from the Modal examples repo.""" + + filename: Optional[str] = Field( + ..., description="The filename of the example." + ) + + +# With these schemas in hand, it's straightforward to write the function that extracts the metadata. +# Note that we decorate it with `@app.function` to make it run on Modal. + + +@app.function(concurrency_limit=5) # watch those LLM API rate limits! def extract_example_metadata( example_contents: Optional[str] = None, filename: Optional[str] = None, @@ -113,47 +162,16 @@ def extract_example_metadata( ): import instructor from anthropic import Anthropic - from pydantic import BaseModel, Field + # if no example is provided, use the contents of this example if example_contents is None: example_contents = Path(__file__).read_text() filename = Path(__file__).name - class ExampleMetadataExtraction(BaseModel): - """Extracted metadata about an example from the Modal examples repo.""" - - summary: str = Field(..., description="A brief summary of the example.") - has_thorough_explanation: bool = Field( - ..., - description="The example contains, in the form of inline comments with markdown formatting, a thorough explanation of what the code does.", - ) - domains: list[ - Literal[ - "artificial_intelligence", - "machine_learning", - "data_science", - "web_serving", - "parallel_computing", - ] - ] = Field(..., description="The") - difficulty: Literal[1, 2, 3, 4, 5] = Field( - ..., - description="The difficulty of the example, from 1 to 5. An example that uses only one or two basic Modal features and is understandable by a professional Python developer familiar with the basics of the relevant domains is a 1, while an example that uses many Modal features and uses advanced Python features like async generator coroutines or metaclasses is a 5.", - ) - freshness: float = Field( - ..., - description="The freshness of the example, from 0 to 1. This is relative to your knowledge cutoff. 
Examples are less fresh if they use older libraries and tools.", - ) - - class ExampleMetadata(ExampleMetadataExtraction): - """Metadata about an example from the Modal examples repo.""" - - filename: str = Field(..., description="The filename of the example.") - client = instructor.from_anthropic(Anthropic()) - model = "claude-3-opus-20240229" if with_opus else "claude-3-haiku-20240307" + # add the schema as the `response_model` argument in what otherwise looks like a normal LLM API call extracted_metadata = client.messages.create( model=model, temperature=0.0, @@ -167,18 +185,19 @@ class ExampleMetadata(ExampleMetadataExtraction): ], ) + # inject the filename full_metadata = ExampleMetadata( **extracted_metadata.dict(), filename=filename ) + # return it as JSON return full_metadata.model_dump_json() # ## Addenda # # The rest of the code used in this example is not particularly interesting: -# some boilerplate matplotlib code to generate the figures, -# and a utility function to find all of the examples. +# just a utility function to find all of the examples, which we invoke in the `local_entrypoint` above. def get_examples(silent=True): @@ -195,7 +214,7 @@ def get_examples(silent=True): spec.loader.exec_module(example_utils) examples = [ example - for example in example_utils.get_examples(silent=silent) + for example in example_utils.get_examples() if example.type != 2 # filter out non-code assets ] return examples From d35dd4386b8c5248f5e83e2899f9177fa6cdb1fc Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Sun, 28 Apr 2024 23:24:26 -0700 Subject: [PATCH 05/17] resurrect blender (#723) * adds a refreshed blender example * adds gif of final render --- 06_gpu_and_ml/blender/blender_video.py | 296 +++++++++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 06_gpu_and_ml/blender/blender_video.py diff --git a/06_gpu_and_ml/blender/blender_video.py b/06_gpu_and_ml/blender/blender_video.py new file mode 100644 index 000000000..f5cc55622 --- /dev/null +++ b/06_gpu_and_ml/blender/blender_video.py @@ -0,0 +1,296 @@ +# --- +# output-directory: "/tmp/render" +# --- +# # Render a video with Blender on many GPUs or CPUs in parallel +# +# This example shows how you can render an animated 3D scene using +# [Blender](https://www.blender.org/)'s Python interface. +# +# You can run it on CPUs to scale out on one hundred of containers +# or run it on GPUs to get higher throughput per node. +# Even with this simple scene, GPUs render 2x faster than CPUs. +# +# The final render looks something like this: +# +# ![Spinning Modal logo](https://modal-public-assets.s3.amazonaws.com/modal-blender-render.gif) +# +# ## Defining a Modal app + +import io +import math +from pathlib import Path + +import modal + +# Modal runs your Python functions for you in the cloud. +# You organize your code into apps, collections of functions that work together. + +app = modal.App("examples-blender-logo") + +# We need to define the environment each function runs in -- its container image. +# The block below defines a container image, starting from a basic Debian Linux image +# adding Blender's system-level dependencies +# and then installing the `bpy` package, which is Blender's Python API. + +rendering_image = ( + modal.Image.debian_slim(python_version="3.11") + .apt_install("xorg", "libxkbcommon0") # X11 (Unix GUI) dependencies + .pip_install("bpy") # Blender as a Python package +) + +# ## Rendering a single frame +# +# We define a function that renders a single frame. 
We'll scale this function out on Modal later. +# +# Functions in Modal are defined along with their hardware and their dependencies. +# This function can be run with GPU acceleration or without it, and we'll use a global flag in the code to switch between the two. + +WITH_GPU = True # try changing this to False to run rendering massively in parallel on CPUs! + +# We decorate the function with `@app.function` to define it as a Modal function. +# Note that in addition to defining the hardware requirements of the function, +# we also specify the container image that the function runs in (the one we defined above). + +# The details of the rendering function aren't too important for this example, +# so we abstract them out into functions defined at the end of the file. +# We draw a simple version of the Modal logo: +# two neon green rectangular prisms facing different directions. +# We include a parameter to rotate the prisms around the vertical/Z axis, +# which we'll use to animate the logo. + + +@app.function( + gpu="T4" if WITH_GPU else None, + concurrency_limit=10 + if WITH_GPU + else 100, # default limits on Modal free tier + image=rendering_image, +) +def render(angle: int = 0) -> bytes: + """ + Renders Modal's logo, two neon green rectangular prisms. + + + Args: + angle: How much to rotate the two prisms around the vertical/Z axis, in degrees. + + Returns: + The rendered frame as a PNG image. + """ + import bpy + + # clear existing objects + bpy.ops.object.select_all(action="DESELECT") + bpy.ops.object.select_by_type(type="MESH") + bpy.ops.object.delete() + + # ctx: the current Blender state, which we mutate + ctx = bpy.context + + # scene: the 3D environment we are rendering and its camera(s) + scene = ctx.scene + + # configure rendering -- CPU or GPU, resolution, etc. + # see function definition below for details + configure_rendering(ctx, WITH_GPU) + + scene.render.image_settings.file_format = "PNG" + scene.render.filepath = "output.png" + + # set background to black + black = (0, 0, 0, 1) + scene.world.node_tree.nodes["Background"].inputs[0].default_value = black + + # add the Modal logo: two neon green rectangular prisms + iridescent_material = create_iridescent_material() + + add_prism(ctx, (-1, 0, 0), 45, angle, iridescent_material) + add_prism(ctx, (3, 0, 0), -45, angle, iridescent_material) + + # set up the lighting and camera + bpy.ops.object.light_add(type="POINT", location=(5, 5, 5)) + bpy.context.object.data.energy = 10 + bpy.ops.object.camera_add(location=(7, -7, 5)) + scene.camera = bpy.context.object + ctx.object.rotation_euler = (1.1, 0, 0.785) + + # render + bpy.ops.render.render(write_still=True) + + # return the bytes to the caller + with open(scene.render.filepath, "rb") as image_file: + image_bytes = image_file.read() + + return image_bytes + + +# ### Rendering with acceleration +# +# We can configure the rendering process to use GPU acceleration with NVIDIA CUDA. +# We select the [Cycles rendering engine](https://www.cycles-renderer.org/), which is compatible with CUDA, +# and then activate the GPU. 
+ + +def configure_rendering(ctx, with_gpu: bool): + # configure the rendering process + ctx.scene.render.engine = "CYCLES" + ctx.scene.render.resolution_x = 1920 + ctx.scene.render.resolution_y = 1080 + ctx.scene.render.resolution_percentage = 100 + + # add GPU acceleration if available + if with_gpu: + ctx.preferences.addons[ + "cycles" + ].preferences.compute_device_type = "CUDA" + ctx.scene.cycles.device = "GPU" + + # reload the devices to update the configuration + ctx.preferences.addons["cycles"].preferences.get_devices() + for device in ctx.preferences.addons["cycles"].preferences.devices: + device.use = True + + else: + ctx.scene.cycles.device = "CPU" + + # report rendering devices -- a nice snippet for debugging and ensuring the accelerators are being used + for dev in ctx.preferences.addons["cycles"].preferences.devices: + print( + f"ID:{dev['id']} Name:{dev['name']} Type:{dev['type']} Use:{dev['use']}" + ) + + +# ## Combining frames into a GIF +# +# Rendering 3D images is fun, and GPUs can make it faster, but rendering 3D videos is better! +# We add another function to our app, running on a different, simpler container image +# and different hardware, to combine the frames into a GIF. + +combination_image = modal.Image.debian_slim(python_version="3.11").pip_install( + "pillow==10.3.0" +) + +# The video has a few parameters, which we set here. + +FPS = 60 +FRAME_DURATION_MS = 1000 // FPS +NUM_FRAMES = 360 # drop this for faster iteration while playing around + +# The function to combine the frames into a GIF takes a sequence of byte sequences, one for each rendered frame, +# and converts them into a single sequence of bytes, the GIF. + + +@app.function(image=combination_image) +def combine( + frames_bytes: list[bytes], frame_duration: int = FRAME_DURATION_MS +) -> bytes: + print("🎞️ combining frames into a gif") + from PIL import Image + + frames = [ + Image.open(io.BytesIO(frame_bytes)) for frame_bytes in frames_bytes + ] + + gif_image = io.BytesIO() + frames[0].save( + gif_image, + format="GIF", + save_all=True, + append_images=frames[1:], + duration=frame_duration, + loop=0, + ) + + gif_image.seek(0) + + return gif_image.getvalue() + + +# ## Rendering in parallel in the cloud from the comfort of the command line +# +# With these two functions defined, we need only a few more lines to run our rendering at scale on Modal. +# +# First, we need a function that coordinates our functions to `render` frames and `combine` them. +# We decorate that function with `@app.local_entrypoint` so that we can run it with `modal run blender_video.py`. +# +# In that function, we use `render.map` to map the `render` function over a `range` of `angle`s, +# so that the logo will appear to spin in the final video. +# +# We collect the bytes from each frame into a `list` locally and then send it to `combine` with `.remote`. +# +# The bytes for the video come back to our local machine, and we write them to a file. +# +# The whole rendering process (for six seconds of 1080p 60 FPS video) takes between five and ten minutes on 10 T4 GPUs. + + +@app.local_entrypoint() +def main(): + output_directory = Path("/tmp") / "render" + output_directory.mkdir(parents=True, exist_ok=True) + filename = output_directory / "output.gif" + with open(filename, "wb") as out_file: + out_file.write( + combine.remote(list(render.map(range(0, 360, 360 // NUM_FRAMES)))) + ) + print(f"Image saved to {filename}") + + +# ## Addenda +# +# The remainder of the code in this example defines the details of the render. 
+# It's not particularly interesting, so we put it the end of the file. + + +def add_prism(ctx, location, initial_rotation, angle, material): + """Add a prism at a given location, rotation, and angle, made of the provided material.""" + import bpy + import mathutils + + bpy.ops.mesh.primitive_cube_add(size=2, location=location) + obj = ctx.object # the newly created object + + # assign the material to the object + obj.data.materials.append(material) + + obj.scale = (1, 1, 2) # square base, 2x taller than wide + # Modal logo is rotated 45 degrees + obj.rotation_euler[1] = math.radians(initial_rotation) + + # apply initial transformations + bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) + + # to "animate" the rendering, we rotate the prisms around the Z axis + angle_radians = math.radians(angle) + rotation_matrix = mathutils.Matrix.Rotation(angle_radians, 4, "Z") + obj.matrix_world = rotation_matrix @ obj.matrix_world + bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) + + +def create_iridescent_material(): + import bpy + + mat = bpy.data.materials.new(name="IridescentGreen") + mat.use_nodes = True + nodes = mat.node_tree.nodes + links = mat.node_tree.links + + nodes.clear() + + output_node = nodes.new(type="ShaderNodeOutputMaterial") + emission_node = nodes.new(type="ShaderNodeEmission") + layer_weight = nodes.new(type="ShaderNodeLayerWeight") + color_ramp = nodes.new(type="ShaderNodeValToRGB") + + color_ramp.color_ramp.elements[0].color = (0, 0, 0, 1) + color_ramp.color_ramp.elements[1].color = (0, 1, 0, 1) + layer_weight.inputs["Blend"].default_value = 0.4 + + links.new(layer_weight.outputs["Fresnel"], color_ramp.inputs["Fac"]) + links.new(color_ramp.outputs["Color"], emission_node.inputs["Color"]) + + emission_node.inputs["Strength"].default_value = 5.0 + emission_node.inputs["Color"].default_value = (0.0, 1.0, 0.0, 1) + + links.new(emission_node.outputs["Emission"], output_node.inputs["Surface"]) + + return mat From f23c5c20168e5585341d39a27cf825b5117c6c4c Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sun, 28 Apr 2024 23:53:47 -0700 Subject: [PATCH 06/17] Add detailed example for Fooocus on Modal (#721) * Add Fooocus Modal example with detailed comments and Markdown sections * edits fooocus example --------- Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Charles Frye --- misc/run_fooocus.py | 99 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 misc/run_fooocus.py diff --git a/misc/run_fooocus.py b/misc/run_fooocus.py new file mode 100644 index 000000000..904f0d530 --- /dev/null +++ b/misc/run_fooocus.py @@ -0,0 +1,99 @@ +# # Generate: Fooocus +# +# This example demonstrates how to set up and run a web server using the Modal library with Fooocus as the frontend. +# Fooocus provides a beginner-friendly interface to work with the SDXL 1.0 model for image generation tasks. +# The script includes the setup of a Docker image, initialization of Fooocus, and launching a web server with GPU support. +# +# ## Basic setup + +import modal + +# To create an image that can run Fooocus, we start from an official NVIDIA base image and then add Python +# and a few system packages. +# +# We then download the Fooocus repository. 
+ +image = ( + modal.Image.from_registry( + "nvidia/cuda:12.3.1-base-ubuntu22.04", add_python="3.10" + ) + .apt_install( + "software-properties-common", + "git", + "git-lfs", + "coreutils", + "aria2", + "libgl1", + "libglib2.0-0", + "curl", + "wget", + "libsm6", + "libxrender1", + "libxext6", + "ffmpeg", + ) + .run_commands("git clone https://github.com/lllyasviel/Fooocus.git") +) + +# ## Initialize Fooocus +# +# We are not limited to running shell commands and package installers in the image setup. +# We can also run Python functions by defining them in our code and passing them to the `run_function` method. +# +# This function installs Fooocus's dependencies and downloads the SDXL 1.0 model to the container image. +# +# This all happens at the time the container image is defined, so that the image is ready to run Fooocus when it is deployed. + + +def init_Fooocus(): + import os + import subprocess + + # change the working directory to the Fooocus directory and install the required Python packages from the requirements file. + os.chdir("/Fooocus") + os.system("pip install -r requirements_versions.txt") + + # change the directory to the models' checkpoints and download the SDXL 1.0 model using wget. + os.chdir("./models/checkpoints") + subprocess.run( + "wget -O juggernautXL_v8Rundiffusion.safetensors 'https://huggingface.co/lllyasviel/fav_models/resolve/main/fav/juggernautXL_v8Rundiffusion.safetensors'", + shell=True, + ) + + +GPU_CONFIG = modal.gpu.T4() +image = image.run_function(init_Fooocus, gpu=GPU_CONFIG) + +# ## Run Fooocus +# +# The `run` function is decorated with `app.function` to define it as a Modal function. +# The `web_server` decorator indicates that this function will serve a web application on the specified port. +# We increase the startup timeout to three minutes to account for the time it takes to load the model and start the server. + +app = modal.App("Fooocus", image=image) + +PORT = 8000 +MINUTES = 60 + + +@app.function(gpu=GPU_CONFIG, timeout=10 * MINUTES) +@modal.web_server(port=PORT, startup_timeout=3 * MINUTES) +def run(): + import os + import subprocess + + # change the working directory to the Fooocus directory. 
+ os.chdir("/Fooocus") + + # launch the Fooocus application using a subprocess that listens on the specified port + subprocess.Popen( + [ + "python", + "launch.py", + "--listen", + "0.0.0.0", + "--port", + str(PORT), + "--always-high-vram", + ] + ) From f87d12deca8f8e14072dd91461ad17d87546ecd8 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Mon, 29 Apr 2024 14:21:10 -0700 Subject: [PATCH 07/17] remove unnecessary nesting of instructor (#725) will add local inference version as a peer when ready --- .../llm-structured/{instructor => }/instructor_generate.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename 06_gpu_and_ml/llm-structured/{instructor => }/instructor_generate.py (100%) diff --git a/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py b/06_gpu_and_ml/llm-structured/instructor_generate.py similarity index 100% rename from 06_gpu_and_ml/llm-structured/instructor/instructor_generate.py rename to 06_gpu_and_ml/llm-structured/instructor_generate.py From f3341012c93f69fdd4729809de948d8753d1d04d Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Mon, 29 Apr 2024 19:15:01 -0700 Subject: [PATCH 08/17] faster renders, numbers on throughput and latency (#726) --- 06_gpu_and_ml/blender/blender_video.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/06_gpu_and_ml/blender/blender_video.py b/06_gpu_and_ml/blender/blender_video.py index f5cc55622..ee1ed85a7 100644 --- a/06_gpu_and_ml/blender/blender_video.py +++ b/06_gpu_and_ml/blender/blender_video.py @@ -137,6 +137,7 @@ def configure_rendering(ctx, with_gpu: bool): ctx.scene.render.resolution_x = 1920 ctx.scene.render.resolution_y = 1080 ctx.scene.render.resolution_percentage = 100 + ctx.scene.cycles.samples = 128 # add GPU acceleration if available if with_gpu: @@ -220,7 +221,8 @@ def combine( # # The bytes for the video come back to our local machine, and we write them to a file. # -# The whole rendering process (for six seconds of 1080p 60 FPS video) takes between five and ten minutes on 10 T4 GPUs. +# The whole rendering process (for six seconds of 1080p 60 FPS video) takes about five minutes to run on 10 T4 GPUs, +# with a per-frame latency of under 10 seconds, and about two minutes to run on 100 CPUs, with a per-frame latency of about 30 seconds. @app.local_entrypoint() From 54c379561c03fde020d43014721bd7a857fe875d Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 2 May 2024 19:30:03 -0700 Subject: [PATCH 09/17] fixes newly-gated models in certain examples (#727) * remove extra line * adds instructions for handling gated model * handles gating for Mistral 7B in outlines example --- 06_gpu_and_ml/llm-serving/tgi_mixtral.py | 1 - 06_gpu_and_ml/llm-serving/vllm_inference.py | 9 +++++++++ 06_gpu_and_ml/llm-structured/outlines_generate.py | 14 ++++++++++++-- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/tgi_mixtral.py b/06_gpu_and_ml/llm-serving/tgi_mixtral.py index c4313043c..5ca7da284 100644 --- a/06_gpu_and_ml/llm-serving/tgi_mixtral.py +++ b/06_gpu_and_ml/llm-serving/tgi_mixtral.py @@ -48,7 +48,6 @@ # We can use the included utilities to download the model weights (and convert to safetensors, if necessary) # as part of the image build. # -# # For this step to work on a [gated model](https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/gated_model_access) # like Mixtral 8x7B, the `HF_TOKEN` environment variable must be set. 
# diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py index 9f6b23a6a..3f67aa908 100644 --- a/06_gpu_and_ml/llm-serving/vllm_inference.py +++ b/06_gpu_and_ml/llm-serving/vllm_inference.py @@ -36,6 +36,13 @@ # ### Download the weights # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. # +# For this step to work on a [gated model](https://huggingface.co/docs/hub/en/models-gated) +# like Mistral 7B, the `HF_TOKEN` environment variable must be set. +# +# After [creating a HuggingFace access token](https://huggingface.co/settings/tokens) +# and accepting the [terms of use](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1), +# head to the [secrets page](https://modal.com/secrets) to share it with Modal as `huggingface-secret`. +# # Tip: avoid using global variables in this function. # Changes to code outside this function will not be detected, and the download step will not re-run. def download_model_to_image(model_dir, model_name): @@ -48,6 +55,7 @@ def download_model_to_image(model_dir, model_name): model_name, local_dir=model_dir, ignore_patterns=["*.pt", "*.bin"], # Using safetensors + token=os.environ["HF_TOKEN"], ) move_cache() @@ -71,6 +79,7 @@ def download_model_to_image(model_dir, model_name): download_model_to_image, timeout=60 * 20, kwargs={"model_dir": MODEL_DIR, "model_name": MODEL_NAME}, + secrets=[modal.Secret.from_name("huggingface-secret")], ) ) diff --git a/06_gpu_and_ml/llm-structured/outlines_generate.py b/06_gpu_and_ml/llm-structured/outlines_generate.py index 19e7ae763..b54acadbf 100644 --- a/06_gpu_and_ml/llm-structured/outlines_generate.py +++ b/06_gpu_and_ml/llm-structured/outlines_generate.py @@ -24,7 +24,7 @@ # First, you'll want to build an image and install the relevant Python dependencies: # `outlines` and a Hugging Face inference stack. -from modal import App, Image, gpu +from modal import App, Image, Secret, gpu app = App( name="outlines-app" @@ -42,6 +42,13 @@ # Next, we download the Mistral-7B model from Hugging Face. # We do this as part of the definition of our Modal image so that # we don't need to download it every time our inference function is run. +# +# For this step to work on a [gated model](https://huggingface.co/docs/hub/en/models-gated) +# like Mistral 7B, the `HF_TOKEN` environment variable must be set. +# +# After [creating a HuggingFace access token](https://huggingface.co/settings/tokens) +# and accepting the [terms of use](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1), +# head to the [secrets page](https://modal.com/secrets) to share it with Modal as `huggingface-secret`. 
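+#
+# If you prefer the command line, you can also create the secret with the Modal CLI, e.g.
+# `modal secret create huggingface-secret HF_TOKEN=hf_...` (the token value here is a placeholder
+# for your own access token).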
def import_model(): @@ -50,7 +57,10 @@ def import_model(): outlines.models.transformers("mistralai/Mistral-7B-v0.1") -outlines_image = outlines_image.run_function(import_model) +outlines_image = outlines_image.run_function( + import_model, + secrets=[Secret.from_name("huggingface-secret")], +) # ## Define the schema From ad9346a7bf38272470ce20e1a3c6d4f578b2cd2c Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 2 May 2024 20:09:25 -0700 Subject: [PATCH 10/17] fixes relative path between instructor_generate and utils (#728) --- 06_gpu_and_ml/llm-structured/instructor_generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/06_gpu_and_ml/llm-structured/instructor_generate.py b/06_gpu_and_ml/llm-structured/instructor_generate.py index 242f419d3..57ba0ed0d 100644 --- a/06_gpu_and_ml/llm-structured/instructor_generate.py +++ b/06_gpu_and_ml/llm-structured/instructor_generate.py @@ -206,7 +206,7 @@ def get_examples(silent=True): We use importlib to avoid the need to define the repo as a package.""" import importlib - examples_root = Path(__file__).parent.parent.parent.parent + examples_root = Path(__file__).parent.parent.parent spec = importlib.util.spec_from_file_location( "utils", f"{examples_root}/internal/utils.py" ) From 8ff22cc373be59f9331f55c7aff799a41b7c0360 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 2 May 2024 22:34:50 -0700 Subject: [PATCH 11/17] centers logo, positions prisms, nicer material, match CPU + GPU throughput (#729) --- 06_gpu_and_ml/blender/blender_video.py | 63 +++++++++++++++++++++----- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/06_gpu_and_ml/blender/blender_video.py b/06_gpu_and_ml/blender/blender_video.py index ee1ed85a7..c07547691 100644 --- a/06_gpu_and_ml/blender/blender_video.py +++ b/06_gpu_and_ml/blender/blender_video.py @@ -6,9 +6,9 @@ # This example shows how you can render an animated 3D scene using # [Blender](https://www.blender.org/)'s Python interface. # -# You can run it on CPUs to scale out on one hundred of containers +# You can run it on CPUs to scale out on one hundred containers # or run it on GPUs to get higher throughput per node. -# Even with this simple scene, GPUs render 2x faster than CPUs. +# Even with this simple scene, GPUs render 10x faster than CPUs. 
# # The final render looks something like this: # @@ -60,7 +60,7 @@ @app.function( - gpu="T4" if WITH_GPU else None, + gpu="A10G" if WITH_GPU else None, concurrency_limit=10 if WITH_GPU else 100, # default limits on Modal free tier @@ -104,12 +104,33 @@ def render(angle: int = 0) -> bytes: # add the Modal logo: two neon green rectangular prisms iridescent_material = create_iridescent_material() - add_prism(ctx, (-1, 0, 0), 45, angle, iridescent_material) - add_prism(ctx, (3, 0, 0), -45, angle, iridescent_material) + add_prism(ctx, (-2.07, -1, 0), 45, angle, iridescent_material) + add_prism(ctx, (2.07, 1, 0), -45, angle, iridescent_material) - # set up the lighting and camera + # set up the lighting + # warm key light bpy.ops.object.light_add(type="POINT", location=(5, 5, 5)) - bpy.context.object.data.energy = 10 + key_light = bpy.context.object + key_light.data.energy = 100 + key_light.data.color = (1, 0.8, 0.5) # warm + + # tight, cool spotlight + bpy.ops.object.light_add(type="SPOT", radius=1, location=(4, 0, 6)) + spot_light = bpy.context.object + spot_light.data.energy = 500 + spot_light.data.spot_size = 0.5 + spot_light.data.color = (0.8, 0.8, 1) # cool + spot_light.rotation_euler = (3.14 / 4, 0, -3.14 / 4) + + # soft overall illumination + bpy.ops.object.light_add(type="AREA", radius=3, location=(-3, 3, 5)) + area_light = bpy.context.object + area_light.data.energy = 50 # softer + area_light.data.size = 5 # larger + area_light.data.color = (1, 1, 1) # neutral + area_light.rotation_euler = (3.14 / 2, 0, 3.14) + + # add camera bpy.ops.object.camera_add(location=(7, -7, 5)) scene.camera = bpy.context.object ctx.object.rotation_euler = (1.1, 0, 0.785) @@ -221,8 +242,8 @@ def combine( # # The bytes for the video come back to our local machine, and we write them to a file. # -# The whole rendering process (for six seconds of 1080p 60 FPS video) takes about five minutes to run on 10 T4 GPUs, -# with a per-frame latency of under 10 seconds, and about two minutes to run on 100 CPUs, with a per-frame latency of about 30 seconds. +# The whole rendering process (for six seconds of 1080p 60 FPS video) takes about five minutes to run on 10 A10G GPUs, +# with a per-frame latency of about 10 seconds, and about five minutes to run on 100 CPUs, with a per-frame latency of about one minute. 
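+#
+# As a rough consistency check on those numbers (all figures are approximate, and container startup
+# plus the final GIF-combination step are ignored here):
+#
+# ```python
+# frames = 360
+# gpu_containers, cpu_containers = 10, 100
+# gpu_seconds_per_frame, cpu_seconds_per_frame = 10, 60
+# print(frames / gpu_containers * gpu_seconds_per_frame / 60)  # ~6 minutes of render time on GPUs
+# print(frames / cpu_containers * cpu_seconds_per_frame / 60)  # ~3.6 minutes of render time on CPUs
+# ```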
@app.local_entrypoint() @@ -251,6 +272,11 @@ def add_prism(ctx, location, initial_rotation, angle, material): bpy.ops.mesh.primitive_cube_add(size=2, location=location) obj = ctx.object # the newly created object + bevel = obj.modifiers.new(name="Bevel", type="BEVEL") + bevel.width = 0.2 + bevel.segments = 5 + bevel.profile = 1.0 + # assign the material to the object obj.data.materials.append(material) @@ -278,13 +304,22 @@ def create_iridescent_material(): nodes.clear() - output_node = nodes.new(type="ShaderNodeOutputMaterial") + principled_node = nodes.new(type="ShaderNodeBsdfPrincipled") + emission_node = nodes.new(type="ShaderNodeEmission") layer_weight = nodes.new(type="ShaderNodeLayerWeight") color_ramp = nodes.new(type="ShaderNodeValToRGB") + mix_shader_node = nodes.new(type="ShaderNodeMixShader") + + output_node = nodes.new(type="ShaderNodeOutputMaterial") + + principled_node.inputs["Base Color"].default_value = (1, 1, 1, 1) + principled_node.inputs["Metallic"].default_value = 1.0 + principled_node.inputs["Roughness"].default_value = 0.5 + color_ramp.color_ramp.elements[0].color = (0, 0, 0, 1) - color_ramp.color_ramp.elements[1].color = (0, 1, 0, 1) + color_ramp.color_ramp.elements[1].color = (0, 0.5, 0, 1) layer_weight.inputs["Blend"].default_value = 0.4 links.new(layer_weight.outputs["Fresnel"], color_ramp.inputs["Fac"]) @@ -293,6 +328,10 @@ def create_iridescent_material(): emission_node.inputs["Strength"].default_value = 5.0 emission_node.inputs["Color"].default_value = (0.0, 1.0, 0.0, 1) - links.new(emission_node.outputs["Emission"], output_node.inputs["Surface"]) + links.new(emission_node.outputs["Emission"], mix_shader_node.inputs[1]) + links.new(principled_node.outputs["BSDF"], mix_shader_node.inputs[2]) + links.new(layer_weight.outputs["Fresnel"], mix_shader_node.inputs["Fac"]) + + links.new(mix_shader_node.outputs["Shader"], output_node.inputs["Surface"]) return mat From 03c44cb42a7440fc31ef00631f1a0cf0589161bb Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Fri, 3 May 2024 10:19:43 -0700 Subject: [PATCH 12/17] refactors lighting out of main render function (#730) --- 06_gpu_and_ml/blender/blender_video.py | 52 ++++++++++++++------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/06_gpu_and_ml/blender/blender_video.py b/06_gpu_and_ml/blender/blender_video.py index c07547691..c58c4cbf4 100644 --- a/06_gpu_and_ml/blender/blender_video.py +++ b/06_gpu_and_ml/blender/blender_video.py @@ -107,30 +107,8 @@ def render(angle: int = 0) -> bytes: add_prism(ctx, (-2.07, -1, 0), 45, angle, iridescent_material) add_prism(ctx, (2.07, 1, 0), -45, angle, iridescent_material) - # set up the lighting - # warm key light - bpy.ops.object.light_add(type="POINT", location=(5, 5, 5)) - key_light = bpy.context.object - key_light.data.energy = 100 - key_light.data.color = (1, 0.8, 0.5) # warm - - # tight, cool spotlight - bpy.ops.object.light_add(type="SPOT", radius=1, location=(4, 0, 6)) - spot_light = bpy.context.object - spot_light.data.energy = 500 - spot_light.data.spot_size = 0.5 - spot_light.data.color = (0.8, 0.8, 1) # cool - spot_light.rotation_euler = (3.14 / 4, 0, -3.14 / 4) - - # soft overall illumination - bpy.ops.object.light_add(type="AREA", radius=3, location=(-3, 3, 5)) - area_light = bpy.context.object - area_light.data.energy = 50 # softer - area_light.data.size = 5 # larger - area_light.data.color = (1, 1, 1) # neutral - area_light.rotation_euler = (3.14 / 2, 0, 3.14) - - # add camera + # add lighting and camera + add_lighting() 
bpy.ops.object.camera_add(location=(7, -7, 5)) scene.camera = bpy.context.object ctx.object.rotation_euler = (1.1, 0, 0.785) @@ -335,3 +313,29 @@ def create_iridescent_material(): links.new(mix_shader_node.outputs["Shader"], output_node.inputs["Surface"]) return mat + + +def add_lighting(): + import bpy + + # warm key light + bpy.ops.object.light_add(type="POINT", location=(5, 5, 5)) + key_light = bpy.context.object + key_light.data.energy = 100 + key_light.data.color = (1, 0.8, 0.5) # warm + + # tight, cool spotlight + bpy.ops.object.light_add(type="SPOT", radius=1, location=(4, 0, 6)) + spot_light = bpy.context.object + spot_light.data.energy = 500 + spot_light.data.spot_size = 0.5 + spot_light.data.color = (0.8, 0.8, 1) # cool + spot_light.rotation_euler = (3.14 / 4, 0, -3.14 / 4) + + # soft overall illumination + bpy.ops.object.light_add(type="AREA", radius=3, location=(-3, 3, 5)) + area_light = bpy.context.object + area_light.data.energy = 50 # softer + area_light.data.size = 5 # larger + area_light.data.color = (1, 1, 1) # neutral + area_light.rotation_euler = (3.14 / 2, 0, 3.14) From 5923bff5ab734633ae06b6ab4493838014794d06 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Fri, 3 May 2024 10:36:41 -0700 Subject: [PATCH 13/17] adds rate limit handler from slack SDK (#731) --- 10_integrations/webscraper.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/10_integrations/webscraper.py b/10_integrations/webscraper.py index e85135b08..d817b323e 100644 --- a/10_integrations/webscraper.py +++ b/10_integrations/webscraper.py @@ -39,7 +39,9 @@ async def get_links(url: str) -> set[str]: return set(links) -slack_sdk_image = modal.Image.debian_slim().pip_install("slack-sdk") +slack_sdk_image = modal.Image.debian_slim(python_version="3.10").pip_install( + "slack-sdk==3.27.1" +) @app.function( @@ -48,9 +50,13 @@ async def get_links(url: str) -> set[str]: ) def bot_token_msg(channel, message): import slack_sdk + from slack_sdk.http_retry.builtin_handlers import RateLimitErrorRetryHandler - print(f"Posting {message} to #{channel}") client = slack_sdk.WebClient(token=os.environ["SLACK_BOT_TOKEN"]) + rate_limit_handler = RateLimitErrorRetryHandler(max_retry_count=3) + client.retry_handlers.append(rate_limit_handler) + + print(f"Posting {message} to #{channel}") client.chat_postMessage(channel=channel, text=message) From e0b46deb9889d25832fb392307e9fdccb52d3528 Mon Sep 17 00:00:00 2001 From: Talha SARI Date: Sun, 5 May 2024 04:00:56 +0300 Subject: [PATCH 14/17] Fix whisper streaming (#733) * change endpoint name to transcribe to match example usage * add remote method to modal function usage * use aio to convert synch map into asynch * minor fix * change sleep to 0, fixed the curl giving error otherwise * correct old typo --------- Co-authored-by: Charles Frye --- 06_gpu_and_ml/openai_whisper/streaming/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/06_gpu_and_ml/openai_whisper/streaming/main.py b/06_gpu_and_ml/openai_whisper/streaming/main.py index cc8ae23b3..676d2b485 100644 --- a/06_gpu_and_ml/openai_whisper/streaming/main.py +++ b/06_gpu_and_ml/openai_whisper/streaming/main.py @@ -183,16 +183,16 @@ async def stream_whisper(audio_data: bytes): f.flush() segment_gen = split_silences(f.name) - for result in transcribe_segment.starmap( + async for result in transcribe_segment.starmap( segment_gen, kwargs=dict(audio_data=audio_data, model="base.en") ): - # Must cooperatively yeild here otherwise `StreamingResponse` will not iteratively 
return stream parts. - # see: https://github.com/python/asyncio/issues/284 - await asyncio.sleep(0.5) + # Must cooperatively yield here otherwise `StreamingResponse` will not iteratively return stream parts. + # see: https://github.com/python/asyncio/issues/284#issuecomment-154162668 + await asyncio.sleep(0) yield result["text"] -@web_app.get("/") +@web_app.get("/transcribe") async def transcribe(url: str): """ Usage: @@ -213,7 +213,7 @@ async def transcribe(url: str): print(f"downloading {url}") try: - audio_data = download_mp3_from_youtube(url) + audio_data = download_mp3_from_youtube.remote(url) except pytube.exceptions.RegexMatchError: raise HTTPException( status_code=422, detail=f"Could not process url {url}" From a238c9758583ccaeccdcbc217dddee75651cf26e Mon Sep 17 00:00:00 2001 From: bofeng huang Date: Sun, 5 May 2024 03:08:48 +0200 Subject: [PATCH 15/17] Fix vLLM template (#734) * Update vllm_mixtral.py * Fix template * Fix template --- 06_gpu_and_ml/llm-serving/vllm_gemma.py | 2 +- 06_gpu_and_ml/llm-serving/vllm_inference.py | 4 ++-- 06_gpu_and_ml/llm-serving/vllm_mixtral.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/vllm_gemma.py b/06_gpu_and_ml/llm-serving/vllm_gemma.py index 2a3545961..634c6d47a 100644 --- a/06_gpu_and_ml/llm-serving/vllm_gemma.py +++ b/06_gpu_and_ml/llm-serving/vllm_gemma.py @@ -121,7 +121,7 @@ class Model: @modal.enter() def load(self): self.template = ( - "start_of_turn>user\n{user}\nmodel" + "user\n{user}\nmodel\n" ) # Load the model. Tip: Some models, like MPT, may require `trust_remote_code=true`. diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py index 3f67aa908..c24e345db 100644 --- a/06_gpu_and_ml/llm-serving/vllm_inference.py +++ b/06_gpu_and_ml/llm-serving/vllm_inference.py @@ -109,11 +109,11 @@ class Model: def load_model(self): # Tip: models that are not fully implemented by Hugging Face may require `trust_remote_code=true`. self.llm = vllm.LLM(MODEL_DIR, tensor_parallel_size=GPU_CONFIG.count) - self.template = """[INST] <> + self.template = """[INST] <> {system} <> -{user} [/INST] """ +{user} [/INST]""" @modal.method() def generate(self, user_questions): diff --git a/06_gpu_and_ml/llm-serving/vllm_mixtral.py b/06_gpu_and_ml/llm-serving/vllm_mixtral.py index 57618ae28..eb236b9cb 100644 --- a/06_gpu_and_ml/llm-serving/vllm_mixtral.py +++ b/06_gpu_and_ml/llm-serving/vllm_mixtral.py @@ -121,7 +121,7 @@ def start_engine(self): disable_log_stats=True, # disable logging so we can stream tokens disable_log_requests=True, ) - self.template = " [INST] {user} [/INST] " + self.template = "[INST] {user} [/INST]" # this can take some time! 
self.engine = AsyncLLMEngine.from_engine_args(engine_args) From 2ac53ebc35b38e30d2288efb3cecaf41f19c8733 Mon Sep 17 00:00:00 2001 From: Akshat Bubna Date: Mon, 6 May 2024 00:10:06 -0400 Subject: [PATCH 16/17] install numpy explicitly in wikipedia example (#736) --- 06_gpu_and_ml/embeddings/wikipedia/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/06_gpu_and_ml/embeddings/wikipedia/main.py b/06_gpu_and_ml/embeddings/wikipedia/main.py index 95d898c22..0c3ffb5cc 100644 --- a/06_gpu_and_ml/embeddings/wikipedia/main.py +++ b/06_gpu_and_ml/embeddings/wikipedia/main.py @@ -78,7 +78,7 @@ def spawn_server() -> subprocess.Popen: add_python="3.10", ) .dockerfile_commands("ENTRYPOINT []") - .pip_install("httpx") + .pip_install("httpx", "numpy") ) with tei_image.imports(): From 75d6c997ecedb953e81b5b00cadf677eb96aed9f Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Mon, 6 May 2024 10:44:15 -0700 Subject: [PATCH 17/17] Run examples on change (#735) * inital draft of action to test monitoring * cleans up monitoring workflow * more complete draft of monitoring test action * removes draft monitoring workflow, reorganizes existing workflows * update internal development requirements * turn off dry run now that we're going back to prod * reorganize environment setup * WIP version of example execution * adds .secrets file from act * handles modal serve, proper system exit, drops extra script * updates actions, better environment setup * handle PRs with no changed files * add back dev dependencies for jupytext and pydantic in deploy * reverts changes to typechecking to avoid slowdown --- .github/actions/setup/action.yml | 36 ++++++++++++++ .github/workflows/cd.yml | 8 +--- .github/workflows/check.yml | 27 +++-------- .github/workflows/run-examples.yml | 76 ++++++++++++++++++++++++++++++ .gitignore | 3 ++ internal/requirements.txt | 7 ++- internal/run_example.py | 50 ++++++++++++++++++++ 7 files changed, 179 insertions(+), 28 deletions(-) create mode 100644 .github/actions/setup/action.yml create mode 100644 .github/workflows/run-examples.yml create mode 100644 internal/run_example.py diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml new file mode 100644 index 000000000..0312efef1 --- /dev/null +++ b/.github/actions/setup/action.yml @@ -0,0 +1,36 @@ +name: setup + +description: Set up a Python environment for the examples. 
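+
+# Illustrative usage from a workflow job in this repo (the input values shown here are just
+# examples; both inputs are optional and fall back to the defaults declared below):
+#
+#   steps:
+#     - uses: actions/checkout@v3
+#     - uses: ./.github/actions/setup
+#       with:
+#         version: "3.10"
+#         devDependencies: skip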
+ +inputs: + version: + description: Which Python version to install + required: false + default: "3.11" + devDependencies: + description: Whether to skip dependencies + required: false + default: "no-skip" + +runs: + using: composite + steps: + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.version }} + + - name: Install base packages + shell: bash + run: | + pip install uv + uv pip install --system setuptools wheel + + - name: Install development Python packages + if: ${{ inputs.devDependencies != 'skip' }} + shell: bash + run: uv pip install --system -r internal/requirements.txt + + - name: Install the modal client + shell: bash + run: uv pip install --system modal diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 50ab209a0..451c08f46 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -17,13 +17,9 @@ jobs: steps: - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 with: - python-version: "3.9" - - - name: Install Modal client package and jupytext - run: pip install modal-client jupytext pydantic~=1.10 + fetch-depth: 1 + - uses: ./.github/actions/setup - name: Run deployment script run: | diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 389875d8d..9f058e4c0 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -13,13 +13,9 @@ jobs: steps: - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 with: - python-version: "3.11" - - # keep version here in sync with .pre-commit-config.yaml and other modal repos - - run: pip install ruff==0.2.1 + fetch-depth: 1 + - uses: ./.github/actions/setup - run: ruff check @@ -31,16 +27,14 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 with: - python-version: "3.11" - - name: Install NbConvert - run: pip install jupyter nbconvert + fetch-depth: 1 + - uses: ./.github/actions/setup - name: Check notebooks are cleaned run: | jupyter nbconvert --clear-output --inplace 11_notebooks/*.ipynb - git diff --quiet && git diff --cached --quiet || exit 1 + git diff --quiet 11_notebooks/*.ipynb && git diff --cached --quiet 11_notebooks/*.ipynb || exit 1 pytest: name: Pytest @@ -48,16 +42,9 @@ jobs: steps: - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 with: - python-version: "3.11" - - - name: Install dev dependencies - run: pip install pytest jupytext pydantic~=1.10 - - - name: Install the Modal client - run: pip install modal-client + fetch-depth: 1 + - uses: ./.github/actions/setup - name: Run run: pytest -v . 
diff --git a/.github/workflows/run-examples.yml b/.github/workflows/run-examples.yml new file mode 100644 index 000000000..bf27d0adb --- /dev/null +++ b/.github/workflows/run-examples.yml @@ -0,0 +1,76 @@ +name: Run + +on: + pull_request: + branches: + - main + paths: + - "**.py" + push: + branches: + - main + paths: + - "**.py" + workflow_dispatch: + +# Cancel previous runs of the same PR but do not cancel previous runs on main +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +env: + TERM: linux + TERMINFO: /etc/terminfo + MODAL_TOKEN_ID: ${{ secrets.MODAL_MODAL_LABS_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_MODAL_LABS_TOKEN_SECRET }} + MODAL_ENVIRONMENT: main + +jobs: + # Output all changed files in a JSON format compatible with GitHub Actions job matrices + diff-matrix: + name: Generate matrix of changed examples + runs-on: ubuntu-20.04 + outputs: + matrix: ${{ steps.diff.outputs.all_changed_files }} + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Find changed examples + id: diff + uses: tj-actions/changed-files@v44 + with: + files: "**.py" + files_ignore: "internal/**,misc/**" + matrix: true + + - name: List all changed examples + run: echo '${{ steps.diff.outputs.all_changed_files }}' + + # Run each changed example, using the output of the previous step as a job matrix + run-changed: + name: Run changed example + needs: [diff-matrix] + if: + ${{ needs.diff-matrix.outputs.matrix != '[]' && + needs.diff-matrix.outputs.matrix != '' }} + runs-on: ubuntu-20.04 + strategy: + matrix: + file: ${{ fromJson(needs.diff-matrix.outputs.matrix) }} + fail-fast: false + + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + with: + fetch-depth: 1 + - uses: ./.github/actions/setup + + - name: Run example + run: | + echo "Running ${{ matrix.file }}" + stem=$(basename "${{ matrix.file }}" .py) + python3 -m internal.run_example $stem || exit $? diff --git a/.gitignore b/.gitignore index 53fe8b69e..3218fc050 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ venv .venv + +# secrets file for act, tool for local GitHub Actions testing +.secrets diff --git a/internal/requirements.txt b/internal/requirements.txt index 42bf85702..5c5120ec8 100644 --- a/internal/requirements.txt +++ b/internal/requirements.txt @@ -1,5 +1,8 @@ -modal pytest +jupyter +ipython +nbconvert jupytext~=1.16.1 pydantic~=1.10.14 -mypy==0.950 +mypy==1.2.0 +ruff==0.2.1 diff --git a/internal/run_example.py b/internal/run_example.py new file mode 100644 index 000000000..3b06a3cb0 --- /dev/null +++ b/internal/run_example.py @@ -0,0 +1,50 @@ +import os +import subprocess +import sys +import time + +from . 
import utils + +MINUTES = 60 +TIMEOUT = 12 * MINUTES + + +def run_script(example): + t0 = time.time() + + try: + print(f"cli args: {example.cli_args}") + process = subprocess.run( + example.cli_args, + env=os.environ | {"MODAL_SERVE_TIMEOUT": "5.0"}, + timeout=TIMEOUT, + ) + total_time = time.time() - t0 + if process.returncode == 0: + print(f"Success after {total_time:.2f}s :)") + else: + print( + f"Failed after {total_time:.2f}s with return code {process.returncode} :(" + ) + + returncode = process.returncode + + except subprocess.TimeoutExpired: + print(f"Past timeout of {TIMEOUT}s :(") + returncode = 999 + + return returncode + + +def run_single_example(stem): + examples = utils.get_examples() + for example in examples: + if stem == example.stem: + return run_script(example) + else: + print(f"Could not find example name {stem}") + return 0 + + +if __name__ == "__main__": + sys.exit(run_single_example(sys.argv[1]))
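+
+# Local usage sketch: run this as a module from the repository root (the relative import of
+# `utils` requires it), passing the example's filename stem, e.g. for
+# 06_gpu_and_ml/blender/blender_video.py:
+#
+#   python3 -m internal.run_example blender_video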