Commit 40c1da3

pavanjava and pavanmantha authored
-added api and docker capability of semantic cache and semantic router (#79)
Co-authored-by: pavanmantha <[email protected]>
1 parent e732c28 commit 40c1da3

File tree

9 files changed: +134 -21 lines changed

bootstraprag/templates/qdrant/semantic_cache/.env +6-1

```diff
@@ -2,4 +2,9 @@ QDRANT_URL='http://localhost:6333'
 QDRANT_API_KEY='th3s3cr3tk3y'
 
 OLLAMA_MODEL='llama3.2:latest'
-OLLAMA_BASE_URL='http://localhost:11434'
+OLLAMA_BASE_URL='http://localhost:11434'
+
+model_name_or_path='all-MiniLM-L6-v2'
+
+LIT_SERVER_PORT=8000
+LIT_SERVER_WORKERS_PER_DEVICE=4
```
bootstraprag/templates/qdrant/semantic_cache/api_server.py +17-6

```diff
@@ -1,19 +1,30 @@
 from abc import ABC
+from semantic_cache import SemanticCache, compute_response
 import litserve as ls
+from dotenv import load_dotenv, find_dotenv
+import os
 
 
 class SemanticCacheAPI(ls.LitAPI, ABC):
     def __init__(self):
-        pass
+        load_dotenv(find_dotenv())
+        self.semantic_cache: SemanticCache = None
 
     def setup(self, device):
-        pass
+        self.semantic_cache = SemanticCache()
 
     def decode_request(self, request, **kwargs):
-        pass
+        return request['question']
 
-    def predict(self, x, **kwargs):
-        pass
+    def predict(self, query, **kwargs):
+        return self.semantic_cache.get_response(query=query, compute_response_func=compute_response)
 
     def encode_response(self, output, **kwargs):
-        pass
+        return {"response": output}
+
+
+if __name__ == '__main__':
+    api = SemanticCacheAPI()
+    server = ls.LitServer(lit_api=api, api_path='/api/v1/chat-completion',
+                          workers_per_device=int(os.environ.get('LIT_SERVER_WORKERS_PER_DEVICE')))
+    server.run(port=int(os.environ.get('LIT_SERVER_PORT')))
```
```diff
@@ -0,0 +1,17 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import requests
+
+response = requests.post("http://127.0.0.1:8000/api/v1/chat-completion", json={"question": "what is the capital of India?"})
+print(f"Status: {response.status_code}\nResponse:\n {response.text}")
```
bootstraprag/templates/qdrant/semantic_cache/readme.md +13-1

```diff
@@ -1,4 +1,16 @@
 ## Qdrant Semantic Cache
+Semantic Cache is a superfast caching mechanism based on contextual meaning; it is very useful for LLM applications, returning the same response for semantically similar queries without much deviation.
 
+### How to run
 - `pip install -r requirements.txt`
-- `python semantic_cache.py`
+- `python semantic_cache.py`
+
+### Expose Semantic Cache as API
+- `python api_server.py`
+```text
+API: http://localhost:8000/api/v1/chat-completion
+Method: POST
+payload: {
+    "question": "what is the capital of India?"
+}
+```
```
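
For a quick smoke test of the endpoint documented above, a minimal Python call might look like the sketch below (assuming `api_server.py` is running locally on port 8000; the `{"response": ...}` shape comes from `encode_response`):

```python
import requests

# POST a question to the semantic-cache endpoint documented in the readme.
resp = requests.post(
    "http://localhost:8000/api/v1/chat-completion",
    json={"question": "what is the capital of India?"},
)

# encode_response wraps the answer as {"response": ...}
print(resp.status_code, resp.json()["response"])
```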

bootstraprag/templates/qdrant/semantic_cache/semantic_cache.py +5-5

```diff
@@ -13,7 +13,7 @@ def __init__(self, threshold=0.35):
         # load the data from env
         load_dotenv(find_dotenv())
 
-        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
+        self.encoder = SentenceTransformer(model_name_or_path=os.environ.get('model_name_or_path'))
         self.cache_client = QdrantClient(url=os.environ.get('QDRANT_URL'), api_key=os.environ.get('QDRANT_API_KEY'))
         self.cache_collection_name = "cache"
         self.threshold = threshold
@@ -78,7 +78,7 @@ def compute_response(query: str):
     return f"Computed response for: {query} is {assistant_message}"
 
 
-semantic_cache = SemanticCache(threshold=0.8)
-query = "What is the capital of France?"
-response = semantic_cache.get_response(query, compute_response)
-print(response)
+# semantic_cache = SemanticCache(threshold=0.8)
+# query = "What is the capital of France?"
+# response = semantic_cache.get_response(query, compute_response)
+# print(response)
```
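
With the module-level demo commented out, `semantic_cache.py` now imports cleanly; a minimal direct-usage sketch, assuming Qdrant and Ollama are reachable as configured in `.env` (the `get_response` signature follows the commented demo lines):

```python
# Use the cache directly, without the API server.
from semantic_cache import SemanticCache, compute_response

cache = SemanticCache(threshold=0.8)

# The first call misses the cache and computes an answer via compute_response;
# a semantically similar follow-up should be served from the Qdrant "cache" collection.
print(cache.get_response("What is the capital of France?", compute_response))
print(cache.get_response("Tell me France's capital city", compute_response))
```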
bootstraprag/templates/qdrant/semantic_routing/.env +4-1

```diff
@@ -1,3 +1,6 @@
 encoder_model='sentence-transformers/all-MiniLM-L6-v2'
 qdrant_api_key='th3s3cr3tk3y'
-qdrant_url='http://localhost:6333/'
+qdrant_url='http://localhost:6333/'
+
+LIT_SERVER_PORT=8000
+LIT_SERVER_WORKERS_PER_DEVICE=4
```
bootstraprag/templates/qdrant/semantic_routing/api_server.py +44-6

```diff
@@ -1,19 +1,57 @@
 from abc import ABC
+
+from semantic_router import Route
+
+from semantic_routing_core import SemanticRouter
 import litserve as ls
+import os
 
 
 class SemanticRoutingAPI(ls.LitAPI, ABC):
     def __init__(self):
-        pass
+        self.semantic_routing_core = None
+        # Define routes
+        politics = Route(
+            name="politics",
+            utterances=[
+                "isn't politics the best thing ever",
+                "why don't you tell me about your political opinions",
+                "don't you just love the president",
+                "they're going to destroy this country!",
+                "they will save the country!",
+            ],
+        )
+
+        chitchat = Route(
+            name="chitchat",
+            utterances=[
+                "how's the weather today?",
+                "how are things going?",
+                "lovely weather today",
+                "the weather is horrendous",
+                "let's go to the chippy",
+            ],
+        )
+
+        self.routes = [politics, chitchat]
 
     def setup(self, device):
-        pass
+        self.semantic_routing_core = SemanticRouter()
+        # Set up routes
+        self.semantic_routing_core.setup_routes(self.routes)
 
     def decode_request(self, request, **kwargs):
-        pass
+        return request['question']
 
-    def predict(self, x, **kwargs):
-        pass
+    def predict(self, query, **kwargs):
+        return self.semantic_routing_core.route_query(query=query)
 
     def encode_response(self, output, **kwargs):
-        pass
+        return {'response': output}
+
+
+if __name__ == '__main__':
+    api = SemanticRoutingAPI()
+    server = ls.LitServer(lit_api=api, api_path='/api/v1/chat-completion',
+                          workers_per_device=int(os.environ.get('LIT_SERVER_WORKERS_PER_DEVICE')))
+    server.run(port=int(os.environ.get('LIT_SERVER_PORT')))
```
```diff
@@ -0,0 +1,17 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import requests
+
+response = requests.post("http://127.0.0.1:8000/api/v1/chat-completion", json={"question": "what is the weather today?"})
+print(f"Status: {response.status_code}\nResponse:\n {response.text}")
```

bootstraprag/templates/qdrant/semantic_routing/readme.md +11-1

```diff
@@ -3,4 +3,14 @@ Semantic Router is a superfast decision-making layer for your LLMs and agents. R
 
 ### How to execute code
 1. `pip install -r requirements.txt`
-2. `python main.py`
+2. `python main.py`
+
+### Expose Semantic Router as API
+- `python api_server.py`
+```text
+API: http://localhost:8000/api/v1/chat-completion
+Method: POST
+payload: {
+    "question": "what is the weather today?"
+}
+```
```
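
As with the cache template, a minimal Python check against the routing endpoint might look like this (assuming `api_server.py` is running locally; the two questions mirror the `chitchat` and `politics` utterances defined above):

```python
import requests

# These should resolve to the "chitchat" and "politics" routes respectively.
for question in ["what is the weather today?", "don't you just love the president"]:
    resp = requests.post(
        "http://localhost:8000/api/v1/chat-completion",
        json={"question": question},
    )
    print(question, "->", resp.json()["response"])
```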
