
Commit

Fixes doc tree (#429)
luiscape authored Aug 28, 2023
1 parent c60814f commit ec398fa
Showing 2 changed files with 9 additions and 168 deletions.
164 changes: 0 additions & 164 deletions 06_gpu_and_ml/mlc/mlc_inference.py

This file was deleted.

13 changes: 9 additions & 4 deletions 06_gpu_and_ml/mlc_inference.py
@@ -1,8 +1,8 @@
# # Llama 2 inference with MLC
#
# [Machine Learning Compilation (MLC)](https://mlc.ai/mlc-llm/) is a high-performance tool for serving
# LLMs including Llama 2. We will use the `mlc_chat` and the pre-compiled
# Llama 2 binaries to run inference using a Modal GPU.
# LLMs including Llama 2. We will use the [`mlc_chat`](https://mlc.ai/mlc-llm/docs/index.html) package
# and the pre-compiled Llama 2 binaries to run inference using a Modal GPU.
#
# This example is adapted from this [MLC chat Colab](https://colab.research.google.com/github/mlc-ai/notebooks/blob/main/mlc-llm/tutorial_chat_module_getting_started.ipynb#scrollTo=yYwjsCOK7Jij).
import queue
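
For orientation, here is a minimal sketch of how the surrounding Modal setup for this file could look. The nightly package names (`mlc-ai-nightly`, `mlc-chat-nightly`) and the wheel index URL are assumptions based on the MLC install instructions of this period, not taken from this commit, and the real example also fetches the pre-compiled Llama 2 binaries separately.

```python
import modal

# Assumed image setup: the pip index and package names follow the MLC
# nightly install instructions of this era, not this commit.
image = modal.Image.debian_slim().run_commands(
    "pip install --pre -f https://mlc.ai/wheels mlc-ai-nightly mlc-chat-nightly"
)

# The stub name is a placeholder.
stub = modal.Stub("mlc-inference", image=image)
```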
@@ -12,6 +12,8 @@

import modal

# ## Imports and global settings
#
# Determine which [GPU](https://modal.com/docs/guide/gpu#gpu-acceleration) you want to use.
GPU: str = "a10g"
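
As a sketch of how this setting is consumed (not shown in this hunk), the GPU string is typically passed to the Modal function decorator; the function body is elided here and sketched more fully below.

```python
# Hypothetical usage of the GPU setting defined above; only the `gpu`
# argument is grounded in this diff.
@stub.function(gpu=GPU)
def generate(prompt: str):
    ...
```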

@@ -68,6 +70,8 @@
"""


# ## Define Modal function
#
# The `generate` function will load MLC chat and the compiled model into
# memory and run inference on an input prompt. This is a generator, streaming
# tokens back to the client as they are generated.
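
To make the streaming pattern concrete, below is a minimal sketch of such a generator. It assumes `mlc_chat` exposes a `ChatModule` whose `generate` method accepts a streaming callback, as in the MLC chat tutorial this example is adapted from; the model identifier, callback signature, and threading details are illustrative rather than copied from this file.

```python
import queue
import threading


@stub.function(gpu=GPU)
def generate(prompt: str):
    # Assumed import path; the model id below is a placeholder for whichever
    # pre-compiled Llama 2 binary is available in the image.
    from mlc_chat import ChatModule

    class QueueCallback:
        """Collects streamed deltas on a queue so they can be yielded."""

        def __init__(self):
            self.queue: queue.Queue = queue.Queue()

        def __call__(self, message: str = "", stopped: bool = False):
            # Assumed callback signature; a `stopped=True` call marks the
            # end of generation.
            self.queue.put(None if stopped else message)

    cm = ChatModule(model="Llama-2-7b-chat-hf-q4f16_1")
    queue_callback = QueueCallback()

    # Run generation on a background thread and stream deltas from the queue.
    worker = threading.Thread(
        target=lambda: cm.generate(prompt=prompt, progress_callback=queue_callback)
    )
    worker.start()

    while True:
        message = queue_callback.queue.get()
        if message is None:
            break
        yield {"type": "output", "message": message}

    worker.join()
```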
@@ -120,12 +124,13 @@ def _generate():
yield {"type": "output", "message": queue_callback.queue.get()}


# ## Run the model
# ## Run model
#
# Create a local Modal entrypoint that calls the `generate` function.
# This uses the `curses` library to render tokens as they are streamed back
# from Modal.
#
# Run this locally with `modal run -q mlc_inference.py`
# Run this locally with `modal run -q mlc_inference.py --prompt "What is serverless computing?"`
@stub.local_entrypoint()
def main(prompt: str):
import curses
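
For completeness, a rough sketch of how that entrypoint can continue, building on the `generate` sketch above. The remote-call method name is an assumption (older Modal clients iterate generators via `generate.call(...)`, newer ones via `generate.remote_gen(...)`); the message shape matches the `yield` shown in the diff, and the rendering logic is illustrative only.

```python
@stub.local_entrypoint()
def main(prompt: str):
    import curses

    def render(stdscr):
        stdscr.clear()
        text = ""
        # Assumed call style: iterate the remote generator and append each
        # streamed token to the screen buffer.
        for update in generate.remote_gen(prompt):
            if update["type"] == "output":
                text += update["message"]
                height, width = stdscr.getmaxyx()
                stdscr.erase()
                # Keep only (roughly) what fits on screen to avoid curses
                # write errors on long outputs.
                stdscr.addstr(0, 0, text[-(height * width - 1):])
                stdscr.refresh()
        stdscr.getkey()  # wait for a keypress before tearing down the screen

    curses.wrapper(render)
```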
