-
Notifications
You must be signed in to change notification settings - Fork 57
/
Copy pathmain.py
90 lines (75 loc) · 2.73 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from dotenv import load_dotenv
import cocoindex
def text_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice:
"""
Embed the text using a SentenceTransformer model.
This is a shared logic between indexing and querying, so extract it as a function.
"""
return text.transform(
cocoindex.functions.SentenceTransformerEmbed(
model="sentence-transformers/all-MiniLM-L6-v2"
)
)
@cocoindex.flow_def(name="TextEmbedding")
def text_embedding_flow(
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
):
"""
Define an example flow that embeds text into a vector database.
"""
data_scope["documents"] = flow_builder.add_source(
cocoindex.sources.LocalFile(path="markdown_files")
)
doc_embeddings = data_scope.add_collector()
with data_scope["documents"].row() as doc:
doc["chunks"] = doc["content"].transform(
cocoindex.functions.SplitRecursively(),
language="markdown",
chunk_size=2000,
chunk_overlap=500,
)
with doc["chunks"].row() as chunk:
chunk["embedding"] = text_to_embedding(chunk["text"])
doc_embeddings.collect(
id=cocoindex.GeneratedField.UUID,
filename=doc["filename"],
location=chunk["location"],
text=chunk["text"],
# 'text_embedding' is the name of the vector we've created the Qdrant collection with.
text_embedding=chunk["embedding"],
)
doc_embeddings.export(
"doc_embeddings",
cocoindex.storages.Qdrant(
collection_name="cocoindex", grpc_url="http://localhost:6334/"
),
primary_key_fields=["id"],
setup_by_user=True,
)
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
name="SemanticsSearch",
flow=text_embedding_flow,
target_name="doc_embeddings",
query_transform_flow=text_to_embedding,
default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
)
@cocoindex.main_fn()
def _run():
# Run queries in a loop to demonstrate the query capabilities.
while True:
try:
query = input("Enter search query (or Enter to quit): ")
if query == "":
break
results, _ = query_handler.search(query, 10, "text_embedding")
print("\nSearch results:")
for result in results:
print(f"[{result.score:.3f}] {result.data['filename']}")
print(f" {result.data['text']}")
print("---")
print()
except KeyboardInterrupt:
break
if __name__ == "__main__":
load_dotenv(override=True)
_run()