Commit

[python] added more examples and fix requirments version (#199)
guocuimi authored May 20, 2024
1 parent 47e3b2f commit cbf6e81
Showing 8 changed files with 96 additions and 40 deletions.
40 changes: 21 additions & 19 deletions python/scalellm/examples/async_stream_chat.py
@@ -3,7 +3,7 @@

def main():
    # Create an LLM engine.
-    engine = AsyncLLMEngine(model="meta-llama/Meta-Llama-3-8B-Instruct")
+    engine = AsyncLLMEngine(model="google/gemma-1.1-2b-it")
    # start the engine loop
    engine.start()

@@ -25,20 +25,25 @@ def main():

        # append the user message
        messages.append(Message(role="user", content=prompt))

-        output_stream = engine.schedule_chat(
-            messages=messages,
-            sampling_params=sampling_params,
-            stream=True,
-        )
-        assistant_response = ""
-        print("\n[Assistant]: ", end="", flush=True)
-        for output in output_stream:
-            if len(output.outputs) > 0:
-                response = output.outputs[0].text
-                assistant_response += response
-                print(response, end="", flush=True)
-        print()
+        try:
+            output_stream = engine.schedule_chat(
+                messages=messages,
+                sampling_params=sampling_params,
+                stream=True,
+            )
+            assistant_response = ""
+            print("\n[Assistant]: ", end="", flush=True)
+            for output in output_stream:
+                if len(output.outputs) > 0:
+                    response = output.outputs[0].text
+                    assistant_response += response
+                    print(response, end="", flush=True)
+            print()
+        except KeyboardInterrupt:
+            # cancel the request
+            output_stream.cancel()
+            break

        # append the assistant message
        messages.append(Message(role="assistant", content=assistant_response))
@@ -48,7 +53,4 @@ def main():


if __name__ == "__main__":
-    try:
-        main()
-    except KeyboardInterrupt:
-        pass
+    main()
32 changes: 17 additions & 15 deletions python/scalellm/examples/async_stream_complete.py
@@ -7,32 +7,34 @@ def main():
    # start the engine loop
    engine.start()

-    prompt = input("Enter a prompt: ")
+    prompt = input("\n[Prompt]: ")
    while True:
        if prompt == "exit":
            break
        sampling_params = SamplingParams(
            temperature=0, top_p=1.0, max_tokens=100, echo=True
        )
-        output_stream = engine.schedule(
-            prompt=prompt,
-            sampling_params=sampling_params,
-            stream=True,
-        )
-        for output in output_stream:
-            if len(output.outputs) > 0:
-                print(output.outputs[0].text, end="", flush=True)
-        print()
+        try:
+            output_stream = engine.schedule(
+                prompt=prompt,
+                sampling_params=sampling_params,
+                stream=True,
+            )
+            for output in output_stream:
+                if len(output.outputs) > 0:
+                    print(output.outputs[0].text, end="", flush=True)
+            print()
+        except KeyboardInterrupt:
+            # cancel the request
+            output_stream.cancel()
+            break

        # Get the next prompt.
-        prompt = input("Enter a prompt: ")
+        prompt = input("\n[Prompt]: ")

    # stop the engine
    engine.stop()


if __name__ == "__main__":
-    try:
-        main()
-    except KeyboardInterrupt:
-        pass
+    main()
22 changes: 22 additions & 0 deletions python/scalellm/examples/cpu_offline_inference.py
@@ -0,0 +1,22 @@
from scalellm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, echo=True)

# Create an LLM.
llm = LLM(model="gpt2", devices="cpu")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for i, output in enumerate(outputs):
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text!r}")
2 changes: 1 addition & 1 deletion python/scalellm/examples/offline_inference.py
@@ -12,7 +12,7 @@
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, echo=True)

# Create an LLM.
-llm = LLM(model="gpt2")
+llm = LLM(model="gpt2", devices="cuda")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
29 changes: 29 additions & 0 deletions python/scalellm/examples/speculative_decoding.py
@@ -0,0 +1,29 @@
from scalellm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, echo=True)

# Create an LLM.
llm = LLM(
    model="google/gemma-7b",
    devices="cuda",
    draft_model="google/gemma-2b",
    draft_devices="cuda",
    num_speculative_tokens=4,
)

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for i, output in enumerate(outputs):
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text!r}")
4 changes: 2 additions & 2 deletions python/scalellm/llm_engine.py
@@ -43,7 +43,7 @@ def error(self, error: str) -> bool:
    # cancel the stream
    def cancel(self) -> None:
        self._cancelled = True
-        self._queue.put_nowait(None)
+        self._queue.put_nowait(StopIteration())

    def __iter__(self):
        return self
@@ -92,7 +92,7 @@ def error(self, error: str) -> bool:
    # cancel the stream
    def cancel(self) -> None:
        self._cancelled = True
-        self._queue.put_nowait(None)
+        self._queue.put_nowait(StopAsyncIteration())

    def __aiter__(self):
        return self
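
Enqueueing StopIteration() / StopAsyncIteration() instead of a None sentinel lets the consuming loop terminate as soon as cancel() is called, without a separate null check on every item. A minimal sketch of the pattern, assuming a queue-backed stream (class and method names here are illustrative, not the library's actual implementation):

import queue

class MiniOutputStream:
    """Illustrative queue-backed stream; names are hypothetical."""

    def __init__(self):
        self._queue = queue.Queue()
        self._cancelled = False

    def put(self, item) -> bool:
        # Producer side: stop accepting new items once cancelled.
        if self._cancelled:
            return False
        self._queue.put_nowait(item)
        return True

    def cancel(self) -> None:
        self._cancelled = True
        # Enqueue the exception itself; the consumer's __next__ re-raises it,
        # which ends any `for output in stream:` loop cleanly.
        self._queue.put_nowait(StopIteration())

    def __iter__(self):
        return self

    def __next__(self):
        item = self._queue.get()
        if isinstance(item, Exception):
            raise item
        return item

With this approach __next__ stays a single re-raise, and the async variant works the same way by pushing StopAsyncIteration() through the queue.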
3 changes: 2 additions & 1 deletion python/setup.py
@@ -220,6 +220,7 @@ def build_extension(self, ext: CMakeExtension):
    },
    classifiers=[
        "Development Status :: 3 - Alpha",
+        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
@@ -234,6 +235,6 @@ def build_extension(self, ext: CMakeExtension):
    package_data={
        "scalellm": scalellm_package_data,
    },
-    python_requires=">=3.9",
+    python_requires=">=3.8",
    install_requires=read_requirements(),
)
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,4 +1,4 @@
torch >= 2.1.0
-fastapi >= 0.110.0
huggingface_hub
-shortuuid
+fastapi
+shortuuid
