forked from llmware-ai/llmware
-
Notifications
You must be signed in to change notification settings - Fork 0
/
semantic_retrieval.py
82 lines (55 loc) · 2.71 KB
/
semantic_retrieval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""
This 'getting started' example demonstrates how to use semantic (embedding-based) retrieval with the Query class
1. Create a sample library and build embeddings for it
2. Run semantic queries against the library
3. View the results
"""
import os
from llmware.library import Library
from llmware.retrieval import Query
from llmware.setup import Setup
def create_fin_docs_sample_library(library_name, *, embedding_model_name="industry-bert-sec",
                                   vector_db="faiss", batch_size=500):
    """Create a library from the sample 'FinDocs' files and build embeddings for it.

    Args:
        library_name: Name handed to ``Library().create_new_library``.
        embedding_model_name: Embedding model handed to ``install_new_embedding``.
            Defaults to "industry-bert-sec"; "mini-lm-sbert" is the suggested
            substitute when memory is constrained (e.g. on a laptop).
        vector_db: Vector database handed to ``install_new_embedding``.
            Defaults to "faiss"; substitute Milvus here if you have installed it.
        batch_size: Embedding batch size handed to ``install_new_embedding``.
            Defaults to 500; reduce it when memory is constrained.

    Returns:
        The library object returned by ``create_new_library``, after the sample
        files have been added and ``install_new_embedding`` has been called.
    """

    print(f"update: creating library - {library_name}")

    library = Library().create_new_library(library_name)

    # over_write=False is passed through to Setup().load_sample_files unchanged.
    sample_files_path = Setup().load_sample_files(over_write=False)
    ingestion_folder_path = os.path.join(sample_files_path, "FinDocs")

    # The return value of add_files is not needed by this example.
    library.add_files(ingestion_folder_path)

    print("update: building embeddings - may take a few minutes the first time")

    # The model, vector DB and batch size are parameters so callers can swap them
    # (see the docstring) without editing this function.
    library.install_new_embedding(embedding_model_name=embedding_model_name,
                                  vector_db=vector_db,
                                  batch_size=batch_size)

    return library
def basic_semantic_retrieval_example(library):
    """Run three semantic queries against ``library`` and print every result.

    Args:
        library: The library object handed to ``Query``.

    Returns:
        A list holding the three result lists, in the order the queries ran.
    """

    def _print_hits(prefix, hits):
        # One line per result: prefix, position in the result list, result dict.
        for position, hit in enumerate(hits):
            print(prefix, position, hit)

    searcher = Query(library)

    # Optional: restrict the keys returned per result. By default the full set
    # of keys is returned.
    searcher.query_result_return_keys = ["distance", "file_source", "page_num", "text"]

    # Query 1 - simple semantic query, up to 20 results requested.
    esg_text = "ESG initiatives"
    esg_hits = searcher.semantic_query(esg_text, result_count=20)
    print(f"\nQuery 1 - {esg_text}")
    _print_hits("results - ", esg_hits)

    # Query 2 - same pattern with a smaller result_count.
    stock_text = "stock performance"
    stock_hits = searcher.semantic_query(stock_text, result_count=10)
    print(f"\nQuery 2 - {stock_text}")
    _print_hits("results - ", stock_hits)

    # Query 3 - per the original note, embedding_distance_threshold caps the
    # results to those with distance < 1.0.
    cloud_text = "cloud computing"
    cloud_hits = searcher.semantic_query(cloud_text, result_count=50,
                                         embedding_distance_threshold=1.0)
    print(f"\nQuery 3 - {cloud_text}")
    _print_hits("result - ", cloud_hits)

    return [esg_hits, stock_hits, cloud_hits]
if __name__ == "__main__":

    # Plain string literal: there is nothing to interpolate, so no f-prefix.
    print("Example - Running a Basic Semantic Query")

    # step 1 - create the library and its embeddings from the Financial Docs samples
    lib = create_fin_docs_sample_library("lib_semantic_query_1")

    # step 2 - run the example queries against the library and its embeddings
    my_results = basic_semantic_retrieval_example(lib)