Commit 40c1da3

pavanjava and pavanmantha authored
-added api and docker capability of semantic cache and semantic router (#79)
Co-authored-by: pavanmantha <[email protected]>
1 parent e732c28 commit 40c1da3

File tree

9 files changed: +134 -21 lines changed

bootstraprag/templates/qdrant/semantic_cache/.env +6-1

```diff
@@ -2,4 +2,9 @@ QDRANT_URL='http://localhost:6333'
 QDRANT_API_KEY='th3s3cr3tk3y'
 
 OLLAMA_MODEL='llama3.2:latest'
-OLLAMA_BASE_URL='http://localhost:11434'
+OLLAMA_BASE_URL='http://localhost:11434'
+
+model_name_or_path='all-MiniLM-L6-v2'
+
+LIT_SERVER_PORT=8000
+LIT_SERVER_WORKERS_PER_DEVICE=4
```
bootstraprag/templates/qdrant/semantic_cache/api_server.py +17-6

```diff
@@ -1,19 +1,30 @@
 from abc import ABC
+from semantic_cache import SemanticCache, compute_response
 import litserve as ls
+from dotenv import load_dotenv, find_dotenv
+import os
 
 
 class SemanticCacheAPI(ls.LitAPI, ABC):
     def __init__(self):
-        pass
+        load_dotenv(find_dotenv())
+        self.semantic_cache: SemanticCache = None
 
     def setup(self, device):
-        pass
+        self.semantic_cache = SemanticCache()
 
     def decode_request(self, request, **kwargs):
-        pass
+        return request['question']
 
-    def predict(self, x, **kwargs):
-        pass
+    def predict(self, query, **kwargs):
+        return self.semantic_cache.get_response(query=query, compute_response_func=compute_response)
 
     def encode_response(self, output, **kwargs):
-        pass
+        return {"response": output}
+
+
+if __name__ == '__main__':
+    api = SemanticCacheAPI()
+    server = ls.LitServer(lit_api=api, api_path='/api/v1/chat-completion',
+                          workers_per_device=int(os.environ.get('LIT_SERVER_WORKERS_PER_DEVICE')))
+    server.run(port=int(os.environ.get('LIT_SERVER_PORT')))
```
```diff
@@ -0,0 +1,17 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import requests
+
+response = requests.post("http://127.0.0.1:8000/api/v1/chat-completion", json={"question": "what is the capital of India?"})
+print(f"Status: {response.status_code}\nResponse:\n {response.text}")
```
bootstraprag/templates/qdrant/semantic_cache/readme.md +13-1

```diff
@@ -1,4 +1,16 @@
 ## Qdrant Semantic Cache
+Semantic Cache is a superfast caching mechanism based on contextual meaning; it is very useful for LLM applications, returning the same response for semantically similar queries without much deviation.
 
+### How to run
 - `pip install -r requirements.txt`
-- `python semantic_cache.py`
+- `python semantic_cache.py`
+
+### Expose Semantic Cache as API
+- `python api_server.py`
+```text
+API: http://localhost:8000/api/v1/chat-completion
+Method: POST
+payload: {
+    "question": "what is the capital of India?"
+}
+```
```
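
For a quick smoke test of the endpoint documented above, a minimal Python call might look like the sketch below (assuming `api_server.py` is running locally on port 8000; the `{"response": ...}` shape comes from `encode_response`):

```python
import requests

# POST a question to the semantic-cache endpoint documented in the readme.
resp = requests.post(
    "http://localhost:8000/api/v1/chat-completion",
    json={"question": "what is the capital of India?"},
)

# encode_response wraps the answer as {"response": ...}
print(resp.status_code, resp.json()["response"])
```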

bootstraprag/templates/qdrant/semantic_cache/semantic_cache.py +5-5

```diff
@@ -13,7 +13,7 @@ def __init__(self, threshold=0.35):
         # load the data from env
         load_dotenv(find_dotenv())
 
-        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
+        self.encoder = SentenceTransformer(model_name_or_path=os.environ.get('model_name_or_path'))
         self.cache_client = QdrantClient(url=os.environ.get('QDRANT_URL'), api_key=os.environ.get('QDRANT_API_KEY'))
         self.cache_collection_name = "cache"
         self.threshold = threshold
@@ -78,7 +78,7 @@ def compute_response(query: str):
     return f"Computed response for: {query} is {assistant_message}"
 
 
-semantic_cache = SemanticCache(threshold=0.8)
-query = "What is the capital of France?"
-response = semantic_cache.get_response(query, compute_response)
-print(response)
+# semantic_cache = SemanticCache(threshold=0.8)
+# query = "What is the capital of France?"
+# response = semantic_cache.get_response(query, compute_response)
+# print(response)
```
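
With the module-level demo commented out, `semantic_cache.py` now imports cleanly; a minimal direct-usage sketch, assuming Qdrant and Ollama are reachable as configured in `.env` (the `get_response` signature follows the commented demo lines):

```python
# Use the cache directly, without the API server.
from semantic_cache import SemanticCache, compute_response

cache = SemanticCache(threshold=0.8)

# The first call misses the cache and computes an answer via compute_response;
# a semantically similar follow-up should be served from the Qdrant "cache" collection.
print(cache.get_response("What is the capital of France?", compute_response))
print(cache.get_response("Tell me France's capital city", compute_response))
```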
bootstraprag/templates/qdrant/semantic_routing/.env +4-1

```diff
@@ -1,3 +1,6 @@
 encoder_model='sentence-transformers/all-MiniLM-L6-v2'
 qdrant_api_key='th3s3cr3tk3y'
-qdrant_url='http://localhost:6333/'
+qdrant_url='http://localhost:6333/'
+
+LIT_SERVER_PORT=8000
+LIT_SERVER_WORKERS_PER_DEVICE=4
```
bootstraprag/templates/qdrant/semantic_routing/api_server.py +44-6

```diff
@@ -1,19 +1,57 @@
 from abc import ABC
+
+from semantic_router import Route
+
+from semantic_routing_core import SemanticRouter
 import litserve as ls
+import os
 
 
 class SemanticRoutingAPI(ls.LitAPI, ABC):
     def __init__(self):
-        pass
+        self.semantic_routing_core = None
+        # Define routes
+        politics = Route(
+            name="politics",
+            utterances=[
+                "isn't politics the best thing ever",
+                "why don't you tell me about your political opinions",
+                "don't you just love the president",
+                "they're going to destroy this country!",
+                "they will save the country!",
+            ],
+        )
+
+        chitchat = Route(
+            name="chitchat",
+            utterances=[
+                "how's the weather today?",
+                "how are things going?",
+                "lovely weather today",
+                "the weather is horrendous",
+                "let's go to the chippy",
+            ],
+        )
+
+        self.routes = [politics, chitchat]
 
     def setup(self, device):
-        pass
+        self.semantic_routing_core = SemanticRouter()
+        # Set up routes
+        self.semantic_routing_core.setup_routes(self.routes)
 
     def decode_request(self, request, **kwargs):
-        pass
+        return request['question']
 
-    def predict(self, x, **kwargs):
-        pass
+    def predict(self, query, **kwargs):
+        return self.semantic_routing_core.route_query(query=query)
 
     def encode_response(self, output, **kwargs):
-        pass
+        return {'response': output}
+
+
+if __name__ == '__main__':
+    api = SemanticRoutingAPI()
+    server = ls.LitServer(lit_api=api, api_path='/api/v1/chat-completion',
+                          workers_per_device=int(os.environ.get('LIT_SERVER_WORKERS_PER_DEVICE')))
+    server.run(port=int(os.environ.get('LIT_SERVER_PORT')))
```
```diff
@@ -0,0 +1,17 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import requests
+
+response = requests.post("http://127.0.0.1:8000/api/v1/chat-completion", json={"question": "what is the weather today?"})
+print(f"Status: {response.status_code}\nResponse:\n {response.text}")
```

bootstraprag/templates/qdrant/semantic_routing/readme.md +11-1

```diff
@@ -3,4 +3,14 @@ Semantic Router is a superfast decision-making layer for your LLMs and agents. R
 
 ### How to execute code
 1. `pip install -r requirements.txt`
-2. `python main.py`
+2. `python main.py`
+
+### Expose Semantic Router as API
+- `python api_server.py`
+```text
+API: http://localhost:8000/api/v1/chat-completion
+Method: POST
+payload: {
+    "question": "what is the weather today?"
+}
+```
```
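
As with the cache template, a minimal Python check against the routing endpoint might look like this (assuming `api_server.py` is running locally; the two questions mirror the `chitchat` and `politics` utterances defined above):

```python
import requests

# These should resolve to the "chitchat" and "politics" routes respectively.
for question in ["what is the weather today?", "don't you just love the president"]:
    resp = requests.post(
        "http://localhost:8000/api/v1/chat-completion",
        json={"question": question},
    )
    print(question, "->", resp.json()["response"])
```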
