From e466dfea4ebca6addf487ffa60747cb4649f5e05 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Sun, 12 Jan 2025 15:25:48 -0500 Subject: [PATCH 01/27] fix(agetns-api): init nlp pipeline text-search --- agents-api/agents_api/common/nlp.py | 100 ++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 00ba3d881..502726259 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -294,3 +294,103 @@ def batch_paragraphs_to_custom_queries( results.append(queries) return results + +@lru_cache(maxsize=1000) +def text_to_tsvector_query(text: str, top_n: int = 10) -> str: + """ + Converts text into a PostgreSQL tsquery format using sophisticated NLP processing. + Cached for repeated queries. + + Args: + text (str): Input text to convert + top_n (int): Number of top keywords to include + + Returns: + str: PostgreSQL tsquery compatible string + """ + if not text or not text.strip(): + return "" + + # Process text with spaCy + doc = nlp(text) + + # Extract important keywords using existing extract_keywords function + keywords = extract_keywords(doc, top_n=top_n, clean=True) + + if not keywords: + return "" + + # Find keyword positions using existing matcher + keyword_positions = keyword_matcher.find_matches(doc, keywords) + + if not keyword_positions: + return "" + + # Find proximity groups + groups = find_proximity_groups(keywords, keyword_positions, n=10) + + # Convert groups to tsquery format + tsquery_parts = [] + + for group in groups: + if len(group) == 1: + # Single keyword + tsquery_parts.append(next(iter(group))) + else: + # For multiple keywords in proximity, use <-> operator in PostgreSQL + sorted_group = sorted(group, key=len, reverse=True) + tsquery_parts.append( + "(" + " <-> ".join(f"'{word}'" for word in sorted_group) + ")" + ) + + return " | ".join(tsquery_parts) + +def batch_text_to_tsvector_queries( + paragraphs: list[str], # Changed to list since we don't need tuple for caching + top_n: int = 10, +) -> list[str]: + """ + Process multiple paragraphs into tsquery format efficiently. 
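As a point of reference for the tsquery strings built above (keywords joined with `<->` inside a proximity group and `|` between groups), here is a minimal sketch of how such a string could be handed to PostgreSQL full-text search. The table name `docs` and column `search_tsv` are assumptions for illustration only; they are not taken from this patch.

import asyncio
import asyncpg

async def search_with_tsquery(dsn: str, tsquery_text: str) -> list:
    # tsquery_text is assumed to be the string returned by text_to_tsvector_query,
    # e.g. "('machine learning' <-> 'algorithm') | 'database'".
    conn = await asyncpg.connect(dsn)
    try:
        return await conn.fetch(
            """
            SELECT doc_id, ts_rank_cd(search_tsv, q) AS rank   -- column name is assumed
            FROM docs, to_tsquery('english', $1) AS q          -- table name is assumed
            WHERE search_tsv @@ q
            ORDER BY rank DESC
            LIMIT 10
            """,
            tsquery_text,
        )
    finally:
        await conn.close()

# usage sketch: asyncio.run(search_with_tsquery(dsn, text_to_tsvector_query("some text")))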
+ + Args: + paragraphs (list[str]): List of paragraphs to process + top_n (int): Number of top keywords to include per paragraph + + Returns: + list[str]: List of tsquery strings + """ + results = [] + + # Use spaCy's pipe for efficient batch processing + docs = nlp.pipe(paragraphs) + + for doc in docs: + # Process each paragraph + keywords = extract_keywords(doc, top_n=top_n, clean=True) + + if not keywords: + results.append("") + continue + + keyword_positions = keyword_matcher.find_matches(doc, keywords) + + if not keyword_positions: + results.append("") + continue + + groups = find_proximity_groups(keywords, keyword_positions, n=10) + + # Build tsquery for this paragraph + tsquery_parts = [] + for group in groups: + if len(group) == 1: + tsquery_parts.append(next(iter(group))) + else: + sorted_group = sorted(group, key=len, reverse=True) + tsquery_parts.append( + "(" + " <-> ".join(f"'{word}'" for word in sorted_group) + ")" + ) + + results.append(" | ".join(tsquery_parts)) + + return results From ba39b547c87b991899900caff528b5ec512cb4d4 Mon Sep 17 00:00:00 2001 From: Vedantsahai18 Date: Sun, 12 Jan 2025 20:26:47 +0000 Subject: [PATCH 02/27] refactor: Lint agents-api (CI) --- agents-api/agents_api/common/nlp.py | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 502726259..250eec6c5 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -295,43 +295,44 @@ def batch_paragraphs_to_custom_queries( return results + @lru_cache(maxsize=1000) def text_to_tsvector_query(text: str, top_n: int = 10) -> str: """ Converts text into a PostgreSQL tsquery format using sophisticated NLP processing. Cached for repeated queries. - + Args: text (str): Input text to convert top_n (int): Number of top keywords to include - + Returns: str: PostgreSQL tsquery compatible string """ if not text or not text.strip(): return "" - + # Process text with spaCy doc = nlp(text) - + # Extract important keywords using existing extract_keywords function keywords = extract_keywords(doc, top_n=top_n, clean=True) - + if not keywords: return "" - + # Find keyword positions using existing matcher keyword_positions = keyword_matcher.find_matches(doc, keywords) - + if not keyword_positions: return "" - + # Find proximity groups groups = find_proximity_groups(keywords, keyword_positions, n=10) - + # Convert groups to tsquery format tsquery_parts = [] - + for group in groups: if len(group) == 1: # Single keyword @@ -339,23 +340,22 @@ def text_to_tsvector_query(text: str, top_n: int = 10) -> str: else: # For multiple keywords in proximity, use <-> operator in PostgreSQL sorted_group = sorted(group, key=len, reverse=True) - tsquery_parts.append( - "(" + " <-> ".join(f"'{word}'" for word in sorted_group) + ")" - ) - + tsquery_parts.append("(" + " <-> ".join(f"'{word}'" for word in sorted_group) + ")") + return " | ".join(tsquery_parts) + def batch_text_to_tsvector_queries( paragraphs: list[str], # Changed to list since we don't need tuple for caching top_n: int = 10, ) -> list[str]: """ Process multiple paragraphs into tsquery format efficiently. 
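A side note on the "Changed to list since we don't need tuple for caching" comment above: functools.lru_cache only accepts hashable arguments, which is why the single-text function is cached on its str argument while the batch variant takes a plain list and stays uncached. A minimal sketch of that trade-off; the names here are illustrative, not from the patch.

from functools import lru_cache

@lru_cache(maxsize=1000)
def cached_query(text: str) -> str:
    # str is hashable, so repeated inputs hit the cache
    return text.strip().lower()

def batch_queries(texts: list[str]) -> list[str]:
    # a list argument is unhashable and cannot be memoized directly;
    # callers would have to pass tuple(texts) to a cached wrapper instead
    return [cached_query(t) for t in texts]

assert batch_queries(["Hello ", "WORLD"]) == ["hello", "world"]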
- + Args: paragraphs (list[str]): List of paragraphs to process top_n (int): Number of top keywords to include per paragraph - + Returns: list[str]: List of tsquery strings """ From 8c3d6be2139bb0b958f3a5ac814c08c942901b6d Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Sun, 12 Jan 2025 23:21:21 -0500 Subject: [PATCH 03/27] chore: misc update --- agents-api/agents_api/common/nlp.py | 284 +++++++++++++++------------- 1 file changed, 155 insertions(+), 129 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 250eec6c5..8a640a535 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -180,35 +180,162 @@ def union(u: str, v: str) -> None: return list(groups.values()) -def build_query_pattern(group_size: int, n: int) -> str: - """Cache query patterns for common group sizes.""" - if group_size == 1: - return '"{}"' - return f"NEAR/{n}(" + " ".join('"{}"' for _ in range(group_size)) + ")" +# def build_query_pattern(group_size: int, n: int) -> str: +# """Cache query patterns for common group sizes.""" +# if group_size == 1: +# return '"{}"' +# return f"NEAR/{n}(" + " ".join('"{}"' for _ in range(group_size)) + ")" + + +# def build_query(groups: list[set[str]], n: int = 10) -> str: +# """Build query with cached patterns.""" +# clauses = [] + +# for group in groups: +# if len(group) == 1: +# clauses.append(f'"{next(iter(group))}"') +# else: +# # Sort by length descending to prioritize longer phrases +# sorted_group = sorted(group, key=len, reverse=True) +# # Get cached pattern and format with keywords +# pattern = build_query_pattern(len(group), n) +# clause = pattern.format(*sorted_group) +# clauses.append(clause) + +# return " OR ".join(clauses) + + +# @lru_cache(maxsize=100) +# def paragraph_to_custom_queries( +# paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1 +# ) -> list[str]: +# """ +# Optimized paragraph processing with minimal behavior changes. +# Added min_keywords parameter to filter out low-value queries. + +# Args: +# paragraph (str): The input paragraph to convert. +# top_n (int): Number of top keywords to extract per sentence. +# proximity_n (int): The proximity window for NEAR/n. +# min_keywords (int): Minimum number of keywords required to form a query. + +# Returns: +# list[str]: The list of custom query strings. +# """ +# if not paragraph or not paragraph.strip(): +# return [] + +# # Process entire paragraph once +# doc = nlp(paragraph) +# queries = [] + +# # Process sentences +# for sent in doc.sents: +# # Convert to doc for consistent API +# sent_doc = sent.as_doc() + +# # Extract and clean keywords +# keywords = extract_keywords(sent_doc, top_n) +# if len(keywords) < min_keywords: +# continue + +# # Find keyword positions using matcher +# keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) + +# # Skip if no keywords found in positions +# if not keyword_positions: +# continue + +# # Find proximity groups and build query +# groups = find_proximity_groups(keywords, keyword_positions, proximity_n) +# query = build_query(groups, proximity_n) + +# if query: +# queries.append(query) + +# return queries + + +# def batch_paragraphs_to_custom_queries( +# paragraphs: list[str], +# top_n: int = 10, +# proximity_n: int = 10, +# min_keywords: int = 1, +# n_process: int = 1, +# ) -> list[list[str]]: +# """ +# Processes multiple paragraphs using nlp.pipe for better performance. + +# Args: +# paragraphs (list[str]): list of paragraphs to process. 
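The builders being retired here emitted NEAR/n syntax, which PostgreSQL's tsquery parser does not accept. If proximity semantics were ever wanted again, tsquery's distance operator `<N>` (of which `<->` is the N=1 case) is the rough ordered, exact-distance counterpart; the helper below only illustrates that operator and is not part of this patch.

def near_to_tsquery(words: list[str], n: int) -> str:
    # Approximate the old NEAR/n grouping with PostgreSQL's <N> distance operator.
    if len(words) == 1:
        return f"'{words[0]}'"
    return "(" + f" <{n}> ".join(f"'{w}'" for w in words) + ")"

assert near_to_tsquery(["quick", "fox"], 3) == "('quick' <3> 'fox')"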
+# top_n (int): Number of top keywords to extract per sentence. +# proximity_n (int): The proximity window for NEAR/n. +# min_keywords (int): Minimum number of keywords required to form a query. +# n_process (int): Number of processes to use for multiprocessing. + +# Returns: +# list[list[str]]: A list where each element is a list of queries for a paragraph. +# """ +# results = [] +# for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): +# queries = [] +# for sent in doc.sents: +# sent_doc = sent.as_doc() +# keywords = extract_keywords(sent_doc, top_n) +# if len(keywords) < min_keywords: +# continue +# keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) +# if not keyword_positions: +# continue +# groups = find_proximity_groups(keywords, keyword_positions, proximity_n) +# query = build_query(groups, proximity_n) +# if query: +# queries.append(query) +# results.append(queries) + +# return results + + +def build_ts_query(groups: list[set[str]], proximity_n: int = 10) -> str: + """ + Builds a PostgreSQL tsquery string from groups of keywords. + Args: + groups (list[set[str]]): List of keyword groups + proximity_n (int): Maximum distance between words for proximity search -def build_query(groups: list[set[str]], n: int = 10) -> str: - """Build query with cached patterns.""" - clauses = [] + Returns: + str: PostgreSQL tsquery compatible string + """ + if not groups: + return "" + + query_parts = [] for group in groups: + if not group: # Skip empty groups + continue + if len(group) == 1: - clauses.append(f'"{next(iter(group))}"') + # Single word - just wrap in quotes + word = next(iter(group)) + # No need to check for stopwords since they should be filtered earlier + query_parts.append(f"'{word.lower()}'") else: - # Sort by length descending to prioritize longer phrases - sorted_group = sorted(group, key=len, reverse=True) - # Get cached pattern and format with keywords - pattern = build_query_pattern(len(group), n) - clause = pattern.format(*sorted_group) - clauses.append(clause) + # Multiple words - sort by length (descending) and connect with <-> + sorted_words = sorted(group, key=len, reverse=True) + filtered_words = [word.lower() for word in sorted_words] + if filtered_words: + phrase = " <-> ".join(f"'{word}'" for word in filtered_words) + query_parts.append(f"({phrase})") - return " OR ".join(clauses) + return " & ".join(query_parts) if query_parts else "" -@lru_cache(maxsize=100) -def paragraph_to_custom_queries( +@lru_cache(maxsize=1000) +def text_to_tsvector_query( paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1 -) -> list[str]: +) -> str: """ Optimized paragraph processing with minimal behavior changes. Added min_keywords parameter to filter out low-value queries. @@ -219,8 +346,9 @@ def paragraph_to_custom_queries( proximity_n (int): The proximity window for NEAR/n. min_keywords (int): Minimum number of keywords required to form a query. + Returns: - list[str]: The list of custom query strings. 
+ str: PostgreSQL tsquery compatible string """ if not paragraph or not paragraph.strip(): return [] @@ -248,7 +376,7 @@ def paragraph_to_custom_queries( # Find proximity groups and build query groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - query = build_query(groups, proximity_n) + query = build_ts_query(groups, proximity_n) if query: queries.append(query) @@ -256,7 +384,7 @@ def paragraph_to_custom_queries( return queries -def batch_paragraphs_to_custom_queries( +def batch_text_to_tsvector_queries( paragraphs: list[str], top_n: int = 10, proximity_n: int = 10, @@ -267,16 +395,14 @@ def batch_paragraphs_to_custom_queries( Processes multiple paragraphs using nlp.pipe for better performance. Args: - paragraphs (list[str]): list of paragraphs to process. - top_n (int): Number of top keywords to extract per sentence. - proximity_n (int): The proximity window for NEAR/n. - min_keywords (int): Minimum number of keywords required to form a query. - n_process (int): Number of processes to use for multiprocessing. + paragraphs (list[str]): List of paragraphs to process + top_n (int): Number of top keywords to include per paragraph Returns: - list[list[str]]: A list where each element is a list of queries for a paragraph. + list[str]: List of tsquery strings """ results = [] + for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): queries = [] for sent in doc.sents: @@ -288,109 +414,9 @@ def batch_paragraphs_to_custom_queries( if not keyword_positions: continue groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - query = build_query(groups, proximity_n) + query = build_ts_query(groups, proximity_n) if query: queries.append(query) results.append(queries) return results - - -@lru_cache(maxsize=1000) -def text_to_tsvector_query(text: str, top_n: int = 10) -> str: - """ - Converts text into a PostgreSQL tsquery format using sophisticated NLP processing. - Cached for repeated queries. - - Args: - text (str): Input text to convert - top_n (int): Number of top keywords to include - - Returns: - str: PostgreSQL tsquery compatible string - """ - if not text or not text.strip(): - return "" - - # Process text with spaCy - doc = nlp(text) - - # Extract important keywords using existing extract_keywords function - keywords = extract_keywords(doc, top_n=top_n, clean=True) - - if not keywords: - return "" - - # Find keyword positions using existing matcher - keyword_positions = keyword_matcher.find_matches(doc, keywords) - - if not keyword_positions: - return "" - - # Find proximity groups - groups = find_proximity_groups(keywords, keyword_positions, n=10) - - # Convert groups to tsquery format - tsquery_parts = [] - - for group in groups: - if len(group) == 1: - # Single keyword - tsquery_parts.append(next(iter(group))) - else: - # For multiple keywords in proximity, use <-> operator in PostgreSQL - sorted_group = sorted(group, key=len, reverse=True) - tsquery_parts.append("(" + " <-> ".join(f"'{word}'" for word in sorted_group) + ")") - - return " | ".join(tsquery_parts) - - -def batch_text_to_tsvector_queries( - paragraphs: list[str], # Changed to list since we don't need tuple for caching - top_n: int = 10, -) -> list[str]: - """ - Process multiple paragraphs into tsquery format efficiently. 
- - Args: - paragraphs (list[str]): List of paragraphs to process - top_n (int): Number of top keywords to include per paragraph - - Returns: - list[str]: List of tsquery strings - """ - results = [] - - # Use spaCy's pipe for efficient batch processing - docs = nlp.pipe(paragraphs) - - for doc in docs: - # Process each paragraph - keywords = extract_keywords(doc, top_n=top_n, clean=True) - - if not keywords: - results.append("") - continue - - keyword_positions = keyword_matcher.find_matches(doc, keywords) - - if not keyword_positions: - results.append("") - continue - - groups = find_proximity_groups(keywords, keyword_positions, n=10) - - # Build tsquery for this paragraph - tsquery_parts = [] - for group in groups: - if len(group) == 1: - tsquery_parts.append(next(iter(group))) - else: - sorted_group = sorted(group, key=len, reverse=True) - tsquery_parts.append( - "(" + " <-> ".join(f"'{word}'" for word in sorted_group) + ")" - ) - - results.append(" | ".join(tsquery_parts)) - - return results From 1d677a2589f079543c93ce4c46576b381917b9c4 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Sun, 12 Jan 2025 23:21:45 -0500 Subject: [PATCH 04/27] feat(test): add new embeddings + FTS tests --- agents-api/tests/fixtures.py | 11 + agents-api/tests/test_docs_queries.py | 800 +++++++++++++++++--------- 2 files changed, 553 insertions(+), 258 deletions(-) diff --git a/agents-api/tests/fixtures.py b/agents-api/tests/fixtures.py index b14078d68..166bbef73 100644 --- a/agents-api/tests/fixtures.py +++ b/agents-api/tests/fixtures.py @@ -175,6 +175,17 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test f"[{', '.join([str(x) for x in [1.0] * 1024])}]", ) + await pool.execute( + """ + INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) + VALUES ($1, $2, 0, 1, $3, $4) + """, # Changed chunk_seq from 0 to 1 + developer.id, + doc.id, + "Different test content", + f"[{', '.join([str(x) for x in [0.5] * 1024])}]", + ) + yield await get_doc(developer_id=developer.id, doc_id=doc.id, connection_pool=pool) diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index 6690badfd..70f13a129 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -1,260 +1,251 @@ -from agents_api.autogen.openapi_model import CreateDocRequest from agents_api.clients.pg import create_db_pool -from agents_api.queries.docs.create_doc import create_doc -from agents_api.queries.docs.delete_doc import delete_doc -from agents_api.queries.docs.get_doc import get_doc -from agents_api.queries.docs.list_docs import list_docs from agents_api.queries.docs.search_docs_by_embedding import search_docs_by_embedding -from agents_api.queries.docs.search_docs_by_text import search_docs_by_text -from agents_api.queries.docs.search_docs_hybrid import search_docs_hybrid from ward import test from .fixtures import ( pg_dsn, test_agent, test_developer, - test_doc, test_doc_with_embedding, - test_user, ) EMBEDDING_SIZE: int = 1024 -@test("query: create user doc") -async def _(dsn=pg_dsn, developer=test_developer, user=test_user): - pool = await create_db_pool(dsn=dsn) - doc_created = await create_doc( - developer_id=developer.id, - data=CreateDocRequest( - title="User Doc", - content=["Docs for user testing", "Docs for user testing 2"], - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - owner_type="user", - owner_id=user.id, - connection_pool=pool, - ) - - assert doc_created.id is not 
None - - # Verify doc appears in user's docs - found = await get_doc( - developer_id=developer.id, - doc_id=doc_created.id, - connection_pool=pool, - ) - assert found.id == doc_created.id - - -@test("query: create agent doc") -async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): - pool = await create_db_pool(dsn=dsn) - doc = await create_doc( - developer_id=developer.id, - data=CreateDocRequest( - title="Agent Doc", - content="Docs for agent testing", - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - assert doc.id is not None - - # Verify doc appears in agent's docs - docs_list = await list_docs( - developer_id=developer.id, - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - assert any(d.id == doc.id for d in docs_list) - - -@test("query: get doc") -async def _(dsn=pg_dsn, developer=test_developer, doc=test_doc): - pool = await create_db_pool(dsn=dsn) - doc_test = await get_doc( - developer_id=developer.id, - doc_id=doc.id, - connection_pool=pool, - ) - assert doc_test.id == doc.id - assert doc_test.title is not None - assert doc_test.content is not None - - -@test("query: list user docs") -async def _(dsn=pg_dsn, developer=test_developer, user=test_user): - pool = await create_db_pool(dsn=dsn) - - # Create a doc owned by the user - doc_user = await create_doc( - developer_id=developer.id, - data=CreateDocRequest( - title="User List Test", - content="Some user doc content", - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - owner_type="user", - owner_id=user.id, - connection_pool=pool, - ) - - # List user's docs - docs_list = await list_docs( - developer_id=developer.id, - owner_type="user", - owner_id=user.id, - connection_pool=pool, - ) - assert len(docs_list) >= 1 - assert any(d.id == doc_user.id for d in docs_list) - - -@test("query: list agent docs") -async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): - pool = await create_db_pool(dsn=dsn) - - # Create a doc owned by the agent - doc_agent = await create_doc( - developer_id=developer.id, - data=CreateDocRequest( - title="Agent List Test", - content="Some agent doc content", - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - - # List agent's docs - docs_list = await list_docs( - developer_id=developer.id, - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - assert len(docs_list) >= 1 - assert any(d.id == doc_agent.id for d in docs_list) - - -@test("query: delete user doc") -async def _(dsn=pg_dsn, developer=test_developer, user=test_user): - pool = await create_db_pool(dsn=dsn) - - # Create a doc owned by the user - doc_user = await create_doc( - developer_id=developer.id, - data=CreateDocRequest( - title="User Delete Test", - content="Doc for user deletion test", - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - owner_type="user", - owner_id=user.id, - connection_pool=pool, - ) - - # Delete the doc - await delete_doc( - developer_id=developer.id, - doc_id=doc_user.id, - owner_type="user", - owner_id=user.id, - connection_pool=pool, - ) - - # Verify doc is no longer in user's docs - docs_list = await list_docs( - developer_id=developer.id, - owner_type="user", - owner_id=user.id, - connection_pool=pool, - ) - assert not any(d.id == doc_user.id for d in docs_list) - - -@test("query: delete agent doc") -async def 
_(dsn=pg_dsn, developer=test_developer, agent=test_agent): - pool = await create_db_pool(dsn=dsn) - - # Create a doc owned by the agent - doc_agent = await create_doc( - developer_id=developer.id, - data=CreateDocRequest( - title="Agent Delete Test", - content="Doc for agent deletion test", - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - - # Delete the doc - await delete_doc( - developer_id=developer.id, - doc_id=doc_agent.id, - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - - # Verify doc is no longer in agent's docs - docs_list = await list_docs( - developer_id=developer.id, - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - assert not any(d.id == doc_agent.id for d in docs_list) - - -@test("query: search docs by text") -async def _(dsn=pg_dsn, agent=test_agent, developer=test_developer): - pool = await create_db_pool(dsn=dsn) - - # Create a test document - doc = await create_doc( - developer_id=developer.id, - owner_type="agent", - owner_id=agent.id, - data=CreateDocRequest( - title="Hello", - content="The world is a funny little thing", - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - connection_pool=pool, - ) - - # Search using simpler terms first - result = await search_docs_by_text( - developer_id=developer.id, - owners=[("agent", agent.id)], - query="world", - k=3, - search_language="english", - metadata_filter={"test": "test"}, - connection_pool=pool, - ) - - print("\nSearch results:", result) - - # More specific assertions - assert len(result) >= 1, "Should find at least one document" - assert any(d.id == doc.id for d in result), f"Should find document {doc.id}" - assert result[0].metadata == {"test": "test"}, "Metadata should match" +# @test("query: create user doc") +# async def _(dsn=pg_dsn, developer=test_developer, user=test_user): +# pool = await create_db_pool(dsn=dsn) +# doc_created = await create_doc( +# developer_id=developer.id, +# data=CreateDocRequest( +# title="User Doc", +# content=["Docs for user testing", "Docs for user testing 2"], +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# owner_type="user", +# owner_id=user.id, +# connection_pool=pool, +# ) + +# assert doc_created.id is not None + +# # Verify doc appears in user's docs +# found = await get_doc( +# developer_id=developer.id, +# doc_id=doc_created.id, +# connection_pool=pool, +# ) +# assert found.id == doc_created.id + + +# @test("query: create agent doc") +# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): +# pool = await create_db_pool(dsn=dsn) +# doc = await create_doc( +# developer_id=developer.id, +# data=CreateDocRequest( +# title="Agent Doc", +# content="Docs for agent testing", +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) +# assert doc.id is not None + +# # Verify doc appears in agent's docs +# docs_list = await list_docs( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) +# assert any(d.id == doc.id for d in docs_list) + + +# @test("query: get doc") +# async def _(dsn=pg_dsn, developer=test_developer, doc=test_doc): +# pool = await create_db_pool(dsn=dsn) +# doc_test = await get_doc( +# developer_id=developer.id, +# doc_id=doc.id, +# connection_pool=pool, +# ) +# assert doc_test.id == doc.id +# assert 
doc_test.title is not None +# assert doc_test.content is not None + + +# @test("query: list user docs") +# async def _(dsn=pg_dsn, developer=test_developer, user=test_user): +# pool = await create_db_pool(dsn=dsn) + +# # Create a doc owned by the user +# doc_user = await create_doc( +# developer_id=developer.id, +# data=CreateDocRequest( +# title="User List Test", +# content="Some user doc content", +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# owner_type="user", +# owner_id=user.id, +# connection_pool=pool, +# ) + +# # List user's docs +# docs_list = await list_docs( +# developer_id=developer.id, +# owner_type="user", +# owner_id=user.id, +# connection_pool=pool, +# ) +# assert len(docs_list) >= 1 +# assert any(d.id == doc_user.id for d in docs_list) + + +# @test("query: list agent docs") +# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): +# pool = await create_db_pool(dsn=dsn) + +# # Create a doc owned by the agent +# doc_agent = await create_doc( +# developer_id=developer.id, +# data=CreateDocRequest( +# title="Agent List Test", +# content="Some agent doc content", +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) + +# # List agent's docs +# docs_list = await list_docs( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) +# assert len(docs_list) >= 1 +# assert any(d.id == doc_agent.id for d in docs_list) + + +# @test("query: delete user doc") +# async def _(dsn=pg_dsn, developer=test_developer, user=test_user): +# pool = await create_db_pool(dsn=dsn) + +# # Create a doc owned by the user +# doc_user = await create_doc( +# developer_id=developer.id, +# data=CreateDocRequest( +# title="User Delete Test", +# content="Doc for user deletion test", +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# owner_type="user", +# owner_id=user.id, +# connection_pool=pool, +# ) + +# # Delete the doc +# await delete_doc( +# developer_id=developer.id, +# doc_id=doc_user.id, +# owner_type="user", +# owner_id=user.id, +# connection_pool=pool, +# ) + +# # Verify doc is no longer in user's docs +# docs_list = await list_docs( +# developer_id=developer.id, +# owner_type="user", +# owner_id=user.id, +# connection_pool=pool, +# ) +# assert not any(d.id == doc_user.id for d in docs_list) + + +# @test("query: delete agent doc") +# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): +# pool = await create_db_pool(dsn=dsn) + +# # Create a doc owned by the agent +# doc_agent = await create_doc( +# developer_id=developer.id, +# data=CreateDocRequest( +# title="Agent Delete Test", +# content="Doc for agent deletion test", +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) + +# # Delete the doc +# await delete_doc( +# developer_id=developer.id, +# doc_id=doc_agent.id, +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) + +# # Verify doc is no longer in agent's docs +# docs_list = await list_docs( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) +# assert not any(d.id == doc_agent.id for d in docs_list) + + +# @test("query: search docs by text") +# async def _(dsn=pg_dsn, agent=test_agent, developer=test_developer): +# pool = await create_db_pool(dsn=dsn) + +# # Create a test document 
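The embedding search test kept further below builds its query vector by element-wise averaging of the document's chunk embeddings with `zip(*doc.embeddings)`; a tiny self-contained illustration of that idiom, with made-up vectors:

chunks = [
    [1.0, 2.0, 3.0],
    [3.0, 2.0, 1.0],
]
# zip(*chunks) yields one tuple per dimension, so this averages column-wise
query_embedding = [sum(col) / len(col) for col in zip(*chunks)]
assert query_embedding == [2.0, 2.0, 2.0]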
+# doc = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="Hello", +# content="The world is a funny little thing", +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# # Search using simpler terms first +# result = await search_docs_by_text( +# developer_id=developer.id, +# owners=[("agent", agent.id)], +# query="world", +# k=3, +# search_language="english", +# metadata_filter={"test": "test"}, +# connection_pool=pool, +# ) + +# print("\nSearch results:", result) + +# # More specific assertions +# assert len(result) >= 1, "Should find at least one document" +# assert any(d.id == doc.id for d in result), f"Should find document {doc.id}" +# assert result[0].metadata == {"test": "test"}, "Metadata should match" @test("query: search docs by embedding") @@ -282,25 +273,318 @@ async def _( assert result[0].metadata is not None -@test("query: search docs by hybrid") +# @test("query: search docs by hybrid") +# async def _( +# dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding +# ): +# pool = await create_db_pool(dsn=dsn) + +# # Get query embedding by averaging the embeddings (list of floats) +# query_embedding = [sum(k) / len(k) for k in zip(*doc.embeddings)] + +# # Search using the correct parameter types +# result = await search_docs_hybrid( +# developer_id=developer.id, +# owners=[("agent", agent.id)], +# text_query=doc.content[0] if isinstance(doc.content, list) else doc.content, +# embedding=query_embedding, +# k=3, # Add k parameter +# metadata_filter={"test": "test"}, # Add metadata filter +# connection_pool=pool, +# ) + +# assert len(result) >= 1 +# assert result[0].metadata is not None + + +# @test("query: test tsvector with technical terms and phrases") +# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): +# pool = await create_db_pool(dsn=dsn) + +# # Create documents with technical content +# doc1 = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="Technical Document", +# content="API endpoints using REST architecture with JSON payloads", +# metadata={"domain": "technical"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# doc2 = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="More Technical Terms", +# content="Database optimization using indexing and query planning", +# metadata={"domain": "technical"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# # Test with technical terms +# technical_queries = [ +# "API endpoints", +# "REST architecture", +# "database optimization", +# "indexing" +# ] + +# for query in technical_queries: +# results = await search_docs_by_text( +# developer_id=developer.id, +# owners=[("agent", agent.id)], +# query=query, +# k=3, +# search_language="english", +# connection_pool=pool, +# ) + +# print(f"\nSearch results for '{query}':", results) + +# # Verify appropriate document is found based on query +# if "API" in query or "REST" in query: +# assert any(doc.id == doc1.id for doc in results), f"Doc1 should be found with query '{query}'" +# if "database" in query.lower() or "indexing" in query: +# assert any(doc.id == doc2.id for doc in results), f"Doc2 should be found with query '{query}'" + +# @test("query: test tsvector with varying content 
lengths and special characters") +# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): +# pool = await create_db_pool(dsn=dsn) + +# # Create documents with different content lengths +# short_doc = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="Short", +# content="Brief test document", +# metadata={"length": "short"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# medium_doc = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="Medium", +# content="This is a medium length document that contains more words and context for testing purposes", +# metadata={"length": "medium"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# long_doc = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="Long", +# content="This is a much longer document that contains multiple sentences. It includes various terms and phrases. \ +# The purpose is to test how the search handles longer content with more context. \ +# It should still be able to find relevant matches based on the search query.", +# metadata={"length": "long"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# special_doc = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="Special Characters", +# content="Testing! With? Different... punctuation; marks: and-hyphens, plus+signs & ampersands", +# metadata={"type": "special"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# # Test cases for different content lengths +# length_test_cases = [ +# ("brief test", short_doc.id), +# ("medium length document", medium_doc.id), +# ("multiple sentences", long_doc.id), +# ("document", None) # Should find all documents +# ] + +# for query, expected_doc_id in length_test_cases: +# results = await search_docs_by_text( +# developer_id=developer.id, +# owners=[("agent", agent.id)], +# query=query, +# k=3, +# search_language="english", +# connection_pool=pool, +# ) + +# print(f"\nSearch results for '{query}':", results) + +# if expected_doc_id: +# assert any(doc.id == expected_doc_id for doc in results), \ +# f"Expected document should be found with query '{query}'" +# else: +# # For general terms, verify multiple documents are found +# assert len(results) > 1, f"Multiple documents should be found with query '{query}'" + +# @test("query: test direct tsvector generation") +# async def _(): +# test_cases = [ +# # Single words +# ( +# "test", +# "'test'" +# ), +# ( +# "testing", +# "'testing'" +# ), + +# # Multiple words in single sentence +# ( +# "quick brown fox", +# "'quick' & 'brown' & 'fox'" +# ), +# ( +# "The Quick Brown Fox", +# "'quick' & 'brown' & 'fox'" +# ), + +# # Technical terms and phrases +# ( +# "machine learning algorithm", +# "('machine' <-> 'learning') & 'algorithm'" +# ), +# ( +# "REST API implementation", +# "'rest' & 'api' & 'implementation'" +# ), + +# # Multiple sentences +# ( +# "Machine learning is great. 
Data science rocks.", +# "('machine' <-> 'learning') & 'great' | ('data' <-> 'science') & 'rocks'" +# ), + +# # Quoted phrases +# ( +# '"quick brown fox"', +# "('quick' <-> 'brown' <-> 'fox')" +# ), +# ( +# 'Find "machine learning" algorithms', +# "('machine' <-> 'learning') & 'algorithms' & 'find'" +# ), + +# # Multiple quoted phrases +# ( +# '"data science" and "machine learning"', +# "('data' <-> 'science') & ('machine' <-> 'learning')" +# ), + +# # Edge cases +# ( +# "", +# "" +# ), +# ( +# "the and or", +# "" +# ), +# ( +# "a", +# "" +# ), +# ( +# "X", +# "'x'" +# ), + +# # Empty quotes +# ( +# '""', +# "" +# ), +# ( +# 'test "" phrase', +# "'test' & 'phrase'" +# ), +# ] + +# for input_text, expected_output in test_cases: +# result = text_to_tsvector_query(input_text) +# print(f"\nInput: '{input_text}'") +# print(f"Generated tsquery: '{result}'") +# print(f"Expected: '{expected_output}'") +# assert result == expected_output, \ +# f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" + + +@test("query: search docs by embedding with different confidence levels") async def _( dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding ): pool = await create_db_pool(dsn=dsn) - # Get query embedding by averaging the embeddings (list of floats) + # Create a test document with a different embedding + # different_embedding = [0.5] * EMBEDDING_SIZE # Create different embedding values + # await pool.execute( + # """ + # INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) + # VALUES ($1, $2, 0, 1, $3, $4) + # """, # Changed chunk_seq from 0 to 1 + # developer.id, + # doc.id, + # "Different test content", + # f"[{', '.join([str(x) for x in different_embedding])}]", + # ) + + # Get query embedding (using original doc's embedding) query_embedding = [sum(k) / len(k) for k in zip(*doc.embeddings)] - # Search using the correct parameter types - result = await search_docs_hybrid( - developer_id=developer.id, - owners=[("agent", agent.id)], - text_query=doc.content[0] if isinstance(doc.content, list) else doc.content, - embedding=query_embedding, - k=3, # Add k parameter - metadata_filter={"test": "test"}, # Add metadata filter - connection_pool=pool, - ) - - assert len(result) >= 1 - assert result[0].metadata is not None + # Test with different confidence levels + confidence_tests = [ + (0.99, 0), # High confidence should find no results + (0.7, 1), # Medium confidence should find some results + (0.5, 2), # Lower confidence should find more results + (0.1, 2), # Very low confidence should find all results + ] + + for confidence, expected_min_results in confidence_tests: + results = await search_docs_by_embedding( + developer_id=developer.id, + owners=[("agent", agent.id)], + embedding=query_embedding, + k=3, + confidence=confidence, + metadata_filter={"test": "test"}, + connection_pool=pool, + ) + + print(f"\nSearch results with confidence {confidence}:") + for r in results: + print(f"- Doc ID: {r.id}, Distance: {r.distance}") + + assert len(results) >= expected_min_results, ( + f"Expected at least {expected_min_results} results with confidence {confidence}, got {len(results)}" + ) + + if results: + # Verify that all returned results meet the confidence threshold + for result in results: + assert result.distance >= confidence, ( + f"Result distance {result.distance} is below confidence threshold {confidence}" + ) From 67fc92d1356ae519ecb7d3d0f38cb3ddcfdf80be Mon Sep 17 00:00:00 2001 From: Dmitry Paramonov Date: Mon, 
13 Jan 2025 13:03:05 +0300 Subject: [PATCH 05/27] fix: Remove unused function as the conversion is done by postgres query --- agents-api/agents_api/common/nlp.py | 68 ++++++++++++++--------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 8a640a535..09fc456a3 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -296,40 +296,40 @@ def union(u: str, v: str) -> None: # return results -def build_ts_query(groups: list[set[str]], proximity_n: int = 10) -> str: - """ - Builds a PostgreSQL tsquery string from groups of keywords. +# def build_ts_query(groups: list[set[str]], proximity_n: int = 10) -> str: +# """ +# Builds a PostgreSQL tsquery string from groups of keywords. - Args: - groups (list[set[str]]): List of keyword groups - proximity_n (int): Maximum distance between words for proximity search +# Args: +# groups (list[set[str]]): List of keyword groups +# proximity_n (int): Maximum distance between words for proximity search - Returns: - str: PostgreSQL tsquery compatible string - """ - if not groups: - return "" +# Returns: +# str: PostgreSQL tsquery compatible string +# """ +# if not groups: +# return "" - query_parts = [] +# query_parts = [] - for group in groups: - if not group: # Skip empty groups - continue +# for group in groups: +# if not group: # Skip empty groups +# continue - if len(group) == 1: - # Single word - just wrap in quotes - word = next(iter(group)) - # No need to check for stopwords since they should be filtered earlier - query_parts.append(f"'{word.lower()}'") - else: - # Multiple words - sort by length (descending) and connect with <-> - sorted_words = sorted(group, key=len, reverse=True) - filtered_words = [word.lower() for word in sorted_words] - if filtered_words: - phrase = " <-> ".join(f"'{word}'" for word in filtered_words) - query_parts.append(f"({phrase})") +# if len(group) == 1: +# # Single word - just wrap in quotes +# word = next(iter(group)) +# # No need to check for stopwords since they should be filtered earlier +# query_parts.append(f"'{word.lower()}'") +# else: +# # Multiple words - sort by length (descending) and connect with <-> +# sorted_words = sorted(group, key=len, reverse=True) +# filtered_words = [word.lower() for word in sorted_words] +# if filtered_words: +# phrase = " <-> ".join(f"'{word}'" for word in filtered_words) +# query_parts.append(f"({phrase})") - return " & ".join(query_parts) if query_parts else "" +# return " & ".join(query_parts) if query_parts else "" @lru_cache(maxsize=1000) @@ -376,10 +376,8 @@ def text_to_tsvector_query( # Find proximity groups and build query groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - query = build_ts_query(groups, proximity_n) - - if query: - queries.append(query) + if groups: + queries.append(" AND ".join([f'({" OR ".join(grp)})' for grp in groups])) return queries @@ -414,9 +412,9 @@ def batch_text_to_tsvector_queries( if not keyword_positions: continue groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - query = build_ts_query(groups, proximity_n) - if query: - queries.append(query) + if groups: + queries.append(" AND ".join([f'({" OR ".join(grp)})' for grp in groups])) + results.append(queries) return results From d21f9805deab853fa1cf51997c9f2a549a807e48 Mon Sep 17 00:00:00 2001 From: whiterabbit1983 Date: Mon, 13 Jan 2025 10:04:04 +0000 Subject: [PATCH 06/27] refactor: Lint agents-api (CI) --- 
agents-api/agents_api/common/nlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 09fc456a3..f33657dec 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -377,7 +377,7 @@ def text_to_tsvector_query( # Find proximity groups and build query groups = find_proximity_groups(keywords, keyword_positions, proximity_n) if groups: - queries.append(" AND ".join([f'({" OR ".join(grp)})' for grp in groups])) + queries.append(" AND ".join([f"({' OR '.join(grp)})" for grp in groups])) return queries @@ -413,7 +413,7 @@ def batch_text_to_tsvector_queries( continue groups = find_proximity_groups(keywords, keyword_positions, proximity_n) if groups: - queries.append(" AND ".join([f'({" OR ".join(grp)})' for grp in groups])) + queries.append(" AND ".join([f"({' OR '.join(grp)})" for grp in groups])) results.append(queries) From 8b19c96af0293142d79483adb7e8b8640deae889 Mon Sep 17 00:00:00 2001 From: Dmitry Paramonov Date: Mon, 13 Jan 2025 13:03:05 +0300 Subject: [PATCH 07/27] fix: Remove unused function as the conversion is done by postgres query --- agents-api/agents_api/common/nlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index f33657dec..705b58e79 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -377,7 +377,7 @@ def text_to_tsvector_query( # Find proximity groups and build query groups = find_proximity_groups(keywords, keyword_positions, proximity_n) if groups: - queries.append(" AND ".join([f"({' OR '.join(grp)})" for grp in groups])) + queries.append(" OR ".join([" OR ".join(grp) for grp in groups])) return queries @@ -413,7 +413,7 @@ def batch_text_to_tsvector_queries( continue groups = find_proximity_groups(keywords, keyword_positions, proximity_n) if groups: - queries.append(" AND ".join([f"({' OR '.join(grp)})" for grp in groups])) + queries.append(" OR ".join([" OR ".join(grp) for grp in groups])) results.append(queries) From 70a78b3be65a797de6814a3a648676ace6010413 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Mon, 13 Jan 2025 17:51:19 -0500 Subject: [PATCH 08/27] fix(agents-api): fixed nlp pipeline for FTS --- agents-api/agents_api/common/nlp.py | 236 +++++++--------------------- 1 file changed, 54 insertions(+), 182 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 705b58e79..233517aa8 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -94,14 +94,22 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str] # Extract and filter spans in a single pass ent_spans = [ent for ent in doc.ents if ent.label_ not in excluded_labels] - chunk_spans = [chunk for chunk in doc.noun_chunks if not chunk.root.is_stop] + # Add more comprehensive stopword filtering for noun chunks + chunk_spans = [ + chunk for chunk in doc.noun_chunks + if not chunk.root.is_stop and not all(token.is_stop for token in chunk) + ] all_spans = filter_spans(ent_spans + chunk_spans) - # Process spans efficiently + # Process spans efficiently and filter out spans that are entirely stopwords keywords = [] seen_texts = set() for span in all_spans: + # Skip if all tokens in span are stopwords + if all(token.is_stop for token in span): + continue + text = span.text.strip() lower_text = text.lower() @@ -180,206 +188,61 @@ def 
union(u: str, v: str) -> None: return list(groups.values()) -# def build_query_pattern(group_size: int, n: int) -> str: -# """Cache query patterns for common group sizes.""" -# if group_size == 1: -# return '"{}"' -# return f"NEAR/{n}(" + " ".join('"{}"' for _ in range(group_size)) + ")" - - -# def build_query(groups: list[set[str]], n: int = 10) -> str: -# """Build query with cached patterns.""" -# clauses = [] - -# for group in groups: -# if len(group) == 1: -# clauses.append(f'"{next(iter(group))}"') -# else: -# # Sort by length descending to prioritize longer phrases -# sorted_group = sorted(group, key=len, reverse=True) -# # Get cached pattern and format with keywords -# pattern = build_query_pattern(len(group), n) -# clause = pattern.format(*sorted_group) -# clauses.append(clause) - -# return " OR ".join(clauses) - - -# @lru_cache(maxsize=100) -# def paragraph_to_custom_queries( -# paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1 -# ) -> list[str]: -# """ -# Optimized paragraph processing with minimal behavior changes. -# Added min_keywords parameter to filter out low-value queries. - -# Args: -# paragraph (str): The input paragraph to convert. -# top_n (int): Number of top keywords to extract per sentence. -# proximity_n (int): The proximity window for NEAR/n. -# min_keywords (int): Minimum number of keywords required to form a query. - -# Returns: -# list[str]: The list of custom query strings. -# """ -# if not paragraph or not paragraph.strip(): -# return [] - -# # Process entire paragraph once -# doc = nlp(paragraph) -# queries = [] - -# # Process sentences -# for sent in doc.sents: -# # Convert to doc for consistent API -# sent_doc = sent.as_doc() - -# # Extract and clean keywords -# keywords = extract_keywords(sent_doc, top_n) -# if len(keywords) < min_keywords: -# continue - -# # Find keyword positions using matcher -# keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) - -# # Skip if no keywords found in positions -# if not keyword_positions: -# continue - -# # Find proximity groups and build query -# groups = find_proximity_groups(keywords, keyword_positions, proximity_n) -# query = build_query(groups, proximity_n) - -# if query: -# queries.append(query) - -# return queries - - -# def batch_paragraphs_to_custom_queries( -# paragraphs: list[str], -# top_n: int = 10, -# proximity_n: int = 10, -# min_keywords: int = 1, -# n_process: int = 1, -# ) -> list[list[str]]: -# """ -# Processes multiple paragraphs using nlp.pipe for better performance. - -# Args: -# paragraphs (list[str]): list of paragraphs to process. -# top_n (int): Number of top keywords to extract per sentence. -# proximity_n (int): The proximity window for NEAR/n. -# min_keywords (int): Minimum number of keywords required to form a query. -# n_process (int): Number of processes to use for multiprocessing. - -# Returns: -# list[list[str]]: A list where each element is a list of queries for a paragraph. 
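The extract_keywords change earlier in this patch drops noun chunks made up entirely of stopwords. A small standalone sketch of that filter, assuming the en_core_web_sm model is installed; the actual `nlp` object in nlp.py is loaded elsewhere and may differ.

import spacy

nlp_demo = spacy.load("en_core_web_sm")  # noun_chunks needs a dependency parse
doc = nlp_demo("This is a quick brown fox jumping over the lazy dog.")

chunks = [
    chunk.text
    for chunk in doc.noun_chunks
    if not chunk.root.is_stop and not all(token.is_stop for token in chunk)
]
# A chunk consisting only of stopwords (e.g. a bare pronoun) is filtered out
print(chunks)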
-# """ -# results = [] -# for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): -# queries = [] -# for sent in doc.sents: -# sent_doc = sent.as_doc() -# keywords = extract_keywords(sent_doc, top_n) -# if len(keywords) < min_keywords: -# continue -# keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) -# if not keyword_positions: -# continue -# groups = find_proximity_groups(keywords, keyword_positions, proximity_n) -# query = build_query(groups, proximity_n) -# if query: -# queries.append(query) -# results.append(queries) - -# return results - - -# def build_ts_query(groups: list[set[str]], proximity_n: int = 10) -> str: -# """ -# Builds a PostgreSQL tsquery string from groups of keywords. - -# Args: -# groups (list[set[str]]): List of keyword groups -# proximity_n (int): Maximum distance between words for proximity search - -# Returns: -# str: PostgreSQL tsquery compatible string -# """ -# if not groups: -# return "" - -# query_parts = [] - -# for group in groups: -# if not group: # Skip empty groups -# continue - -# if len(group) == 1: -# # Single word - just wrap in quotes -# word = next(iter(group)) -# # No need to check for stopwords since they should be filtered earlier -# query_parts.append(f"'{word.lower()}'") -# else: -# # Multiple words - sort by length (descending) and connect with <-> -# sorted_words = sorted(group, key=len, reverse=True) -# filtered_words = [word.lower() for word in sorted_words] -# if filtered_words: -# phrase = " <-> ".join(f"'{word}'" for word in filtered_words) -# query_parts.append(f"({phrase})") - -# return " & ".join(query_parts) if query_parts else "" - - @lru_cache(maxsize=1000) def text_to_tsvector_query( paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1 ) -> str: """ - Optimized paragraph processing with minimal behavior changes. - Added min_keywords parameter to filter out low-value queries. + Extracts meaningful keywords/phrases from text and joins them with OR. + + Example: + Input: "I like basketball especially Michael Jordan" + Output: "basketball OR Michael Jordan" Args: - paragraph (str): The input paragraph to convert. - top_n (int): Number of top keywords to extract per sentence. - proximity_n (int): The proximity window for NEAR/n. - min_keywords (int): Minimum number of keywords required to form a query. 
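Per the commit message earlier in this series ("the conversion is done by postgres query"), the function now returns plain keywords joined by OR, as in the basketball example in the docstring above. Below is a hedged sketch of the kind of server-side conversion that output format lends itself to; the actual SQL used by search_docs_by_text is not shown in this patch.

import asyncpg

async def convert_keywords(dsn: str, keywords: str) -> str:
    # websearch_to_tsquery understands the bare OR keyword,
    # e.g. "basketball OR Michael Jordan" from the docstring example above.
    conn = await asyncpg.connect(dsn)
    try:
        return await conn.fetchval(
            "SELECT websearch_to_tsquery('english', $1)::text", keywords
        )
    finally:
        await conn.close()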
- + paragraph (str): The input text to process + top_n (int): Number of top keywords to extract per sentence + proximity_n (int): The proximity window for grouping related keywords + min_keywords (int): Minimum number of keywords required Returns: - str: PostgreSQL tsquery compatible string + str: Keywords/phrases joined by OR """ if not paragraph or not paragraph.strip(): - return [] + return "" - # Process entire paragraph once doc = nlp(paragraph) - queries = [] + queries = set() # Use set to avoid duplicates - # Process sentences for sent in doc.sents: - # Convert to doc for consistent API sent_doc = sent.as_doc() - - # Extract and clean keywords + + # Extract keywords keywords = extract_keywords(sent_doc, top_n) if len(keywords) < min_keywords: continue - # Find keyword positions using matcher + # Find keyword positions keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) - - # Skip if no keywords found in positions if not keyword_positions: continue - # Find proximity groups and build query + # Group related keywords by proximity groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - if groups: - queries.append(" OR ".join([" OR ".join(grp) for grp in groups])) - return queries + # Add each group as a single term to our set + for group in groups: + if len(group) > 1: + # Sort by length descending to prioritize longer phrases + sorted_group = sorted(group, key=len, reverse=True) + # For truly proximate multi-word groups, group words + queries.add(" OR ".join(sorted_group)) + else: + # For non-proximate words or single words, add them separately + queries.update(group) + + # Join all terms with " OR " + return " OR ".join(queries) if queries else "" def batch_text_to_tsvector_queries( @@ -388,7 +251,7 @@ def batch_text_to_tsvector_queries( proximity_n: int = 10, min_keywords: int = 1, n_process: int = 1, -) -> list[list[str]]: +) -> list[str]: """ Processes multiple paragraphs using nlp.pipe for better performance. 
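For context on the nlp.pipe call used in the batch function above, here is a minimal runnable sketch of streaming several texts through a pipeline. A blank pipeline with a sentencizer stands in for the full model that nlp.py is assumed to load, so the `disable` list from the patch is omitted here.

import spacy

nlp_demo = spacy.blank("en")
nlp_demo.add_pipe("sentencizer")

paragraphs = [
    "Machine learning is great. Databases need indexes.",
    "Full text search uses tsquery strings.",
]

# pipe() streams documents in batches instead of calling the pipeline once per text;
# n_process > 1 would fan the work out to multiple processes, as in the patch.
for doc in nlp_demo.pipe(paragraphs, batch_size=32, n_process=1):
    print([sent.text for sent in doc.sents])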
@@ -402,7 +265,7 @@ def batch_text_to_tsvector_queries( results = [] for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): - queries = [] + queries = set() # Use set to avoid duplicates for sent in doc.sents: sent_doc = sent.as_doc() keywords = extract_keywords(sent_doc, top_n) @@ -412,9 +275,18 @@ def batch_text_to_tsvector_queries( if not keyword_positions: continue groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - if groups: - queries.append(" OR ".join([" OR ".join(grp) for grp in groups])) - - results.append(queries) + # Add each group as a single term to our set + for group in groups: + if len(group) > 1: + # Sort by length descending to prioritize longer phrases + sorted_group = sorted(group, key=len, reverse=True) + # For truly proximate multi-word groups, group words + queries.add(" OR ".join(sorted_group)) + else: + # For non-proximate words or single words, add them separately + queries.update(group) + + # Join all terms with " OR " + results.append(" OR ".join(queries) if queries else "") return results From ab8e3b782c0cfb068c9e9320e6015613a11b4647 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Mon, 13 Jan 2025 17:52:45 -0500 Subject: [PATCH 09/27] chore(tests): added test for the nlp utility + FTS search --- agents-api/tests/test_docs_queries.py | 994 ++++++++++++-------------- 1 file changed, 469 insertions(+), 525 deletions(-) diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index 70f13a129..fd82d7396 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -1,253 +1,366 @@ +from agents_api.autogen.openapi_model import CreateDocRequest from agents_api.clients.pg import create_db_pool +from agents_api.queries.docs.create_doc import create_doc +from agents_api.queries.docs.delete_doc import delete_doc +from agents_api.queries.docs.get_doc import get_doc +from agents_api.queries.docs.list_docs import list_docs from agents_api.queries.docs.search_docs_by_embedding import search_docs_by_embedding +from agents_api.queries.docs.search_docs_by_text import search_docs_by_text +from agents_api.queries.docs.search_docs_hybrid import search_docs_hybrid from ward import test +from agents_api.common.nlp import text_to_tsvector_query + from .fixtures import ( pg_dsn, test_agent, test_developer, + test_doc, test_doc_with_embedding, + test_user, ) EMBEDDING_SIZE: int = 1024 +import math + +def make_vector_with_similarity(n: int, d: float): + """ + Returns a list `v` of length `n` such that the cosine similarity + between `v` and the all-ones vector of length `n` is approximately d. 
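The construction used in the helper's body below can be checked numerically: with v = sign(d) * ones + alpha * u, where u is a unit vector whose entries sum to zero and alpha = sqrt(n * (1 - d^2)) / |d|, the cosine against the all-ones vector works out to d. A small self-contained check; n and d are arbitrary sample values.

import math

def cosine(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b)))

n, d = 1024, 0.7
u = [0.0] * n
u[0], u[1] = 1 / math.sqrt(2), -1 / math.sqrt(2)   # unit vector, entries sum to 0
alpha = math.sqrt(n * (1 - d * d)) / abs(d)
v = [math.copysign(1.0, d) + alpha * u_i for u_i in u]

assert abs(cosine(v, [1.0] * n) - d) < 1e-9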
+ """ + if not -1.0 <= d <= 1.0: + raise ValueError("d must lie in [-1, 1].") + + # Handle special cases exactly: + if abs(d - 1.0) < 1e-12: # d ~ +1 + return [1.0] * n + if abs(d + 1.0) < 1e-12: # d ~ -1 + return [-1.0] * n + if abs(d) < 1e-12: # d ~ 0 + v = [0.0]*n + if n >= 2: + v[0] = 1.0 + v[1] = -1.0 + return v + + sign_d = 1.0 if d >= 0 else -1.0 + + # Base part: sign(d)*[1,1,...,1] + base = [sign_d]*n + + # Orthogonal unit vector u with sum(u)=0; for simplicity: + # u = [1/sqrt(2), -1/sqrt(2), 0, 0, ..., 0] + u = [0.0]*n + if n >= 2: + u[0] = 1.0 / math.sqrt(2) + u[1] = -1.0 / math.sqrt(2) + # (if n=1, there's no truly orthogonal vector to [1], so skip) + + # Solve for alpha: + # alpha^2 = n*(1 - d^2)/d^2 + alpha = math.sqrt(n*(1 - d*d)) / abs(d) + + # Construct v + v = [0.0]*n + for i in range(n): + v[i] = base[i] + alpha * u[i] + + return v + + +@test("query: create user doc") +async def _(dsn=pg_dsn, developer=test_developer, user=test_user): + pool = await create_db_pool(dsn=dsn) + doc_created = await create_doc( + developer_id=developer.id, + data=CreateDocRequest( + title="User Doc", + content=["Docs for user testing", "Docs for user testing 2"], + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + owner_type="user", + owner_id=user.id, + connection_pool=pool, + ) -# @test("query: create user doc") -# async def _(dsn=pg_dsn, developer=test_developer, user=test_user): -# pool = await create_db_pool(dsn=dsn) -# doc_created = await create_doc( -# developer_id=developer.id, -# data=CreateDocRequest( -# title="User Doc", -# content=["Docs for user testing", "Docs for user testing 2"], -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# owner_type="user", -# owner_id=user.id, -# connection_pool=pool, -# ) - -# assert doc_created.id is not None - -# # Verify doc appears in user's docs -# found = await get_doc( -# developer_id=developer.id, -# doc_id=doc_created.id, -# connection_pool=pool, -# ) -# assert found.id == doc_created.id - - -# @test("query: create agent doc") -# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): -# pool = await create_db_pool(dsn=dsn) -# doc = await create_doc( -# developer_id=developer.id, -# data=CreateDocRequest( -# title="Agent Doc", -# content="Docs for agent testing", -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) -# assert doc.id is not None - -# # Verify doc appears in agent's docs -# docs_list = await list_docs( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) -# assert any(d.id == doc.id for d in docs_list) - - -# @test("query: get doc") -# async def _(dsn=pg_dsn, developer=test_developer, doc=test_doc): -# pool = await create_db_pool(dsn=dsn) -# doc_test = await get_doc( -# developer_id=developer.id, -# doc_id=doc.id, -# connection_pool=pool, -# ) -# assert doc_test.id == doc.id -# assert doc_test.title is not None -# assert doc_test.content is not None - - -# @test("query: list user docs") -# async def _(dsn=pg_dsn, developer=test_developer, user=test_user): -# pool = await create_db_pool(dsn=dsn) + assert doc_created.id is not None -# # Create a doc owned by the user -# doc_user = await create_doc( -# developer_id=developer.id, -# data=CreateDocRequest( -# title="User List Test", -# content="Some user doc content", -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# 
owner_type="user", -# owner_id=user.id, -# connection_pool=pool, -# ) - -# # List user's docs -# docs_list = await list_docs( -# developer_id=developer.id, -# owner_type="user", -# owner_id=user.id, -# connection_pool=pool, -# ) -# assert len(docs_list) >= 1 -# assert any(d.id == doc_user.id for d in docs_list) - - -# @test("query: list agent docs") -# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): -# pool = await create_db_pool(dsn=dsn) + # Verify doc appears in user's docs + found = await get_doc( + developer_id=developer.id, + doc_id=doc_created.id, + connection_pool=pool, + ) + assert found.id == doc_created.id -# # Create a doc owned by the agent -# doc_agent = await create_doc( -# developer_id=developer.id, -# data=CreateDocRequest( -# title="Agent List Test", -# content="Some agent doc content", -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) - -# # List agent's docs -# docs_list = await list_docs( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) -# assert len(docs_list) >= 1 -# assert any(d.id == doc_agent.id for d in docs_list) - - -# @test("query: delete user doc") -# async def _(dsn=pg_dsn, developer=test_developer, user=test_user): -# pool = await create_db_pool(dsn=dsn) -# # Create a doc owned by the user -# doc_user = await create_doc( -# developer_id=developer.id, -# data=CreateDocRequest( -# title="User Delete Test", -# content="Doc for user deletion test", -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# owner_type="user", -# owner_id=user.id, -# connection_pool=pool, -# ) - -# # Delete the doc -# await delete_doc( -# developer_id=developer.id, -# doc_id=doc_user.id, -# owner_type="user", -# owner_id=user.id, -# connection_pool=pool, -# ) - -# # Verify doc is no longer in user's docs -# docs_list = await list_docs( -# developer_id=developer.id, -# owner_type="user", -# owner_id=user.id, -# connection_pool=pool, -# ) -# assert not any(d.id == doc_user.id for d in docs_list) - - -# @test("query: delete agent doc") -# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): -# pool = await create_db_pool(dsn=dsn) +@test("query: create agent doc") +async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): + pool = await create_db_pool(dsn=dsn) + doc = await create_doc( + developer_id=developer.id, + data=CreateDocRequest( + title="Agent Doc", + content="Docs for agent testing", + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + assert doc.id is not None + + # Verify doc appears in agent's docs + docs_list = await list_docs( + developer_id=developer.id, + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + assert any(d.id == doc.id for d in docs_list) -# # Create a doc owned by the agent -# doc_agent = await create_doc( -# developer_id=developer.id, -# data=CreateDocRequest( -# title="Agent Delete Test", -# content="Doc for agent deletion test", -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) - -# # Delete the doc -# await delete_doc( -# developer_id=developer.id, -# doc_id=doc_agent.id, -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) - -# # Verify doc is no longer in agent's docs -# docs_list = 
await list_docs( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) -# assert not any(d.id == doc_agent.id for d in docs_list) - - -# @test("query: search docs by text") -# async def _(dsn=pg_dsn, agent=test_agent, developer=test_developer): -# pool = await create_db_pool(dsn=dsn) -# # Create a test document -# doc = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="Hello", -# content="The world is a funny little thing", -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# # Search using simpler terms first -# result = await search_docs_by_text( -# developer_id=developer.id, -# owners=[("agent", agent.id)], -# query="world", -# k=3, -# search_language="english", -# metadata_filter={"test": "test"}, -# connection_pool=pool, -# ) - -# print("\nSearch results:", result) - -# # More specific assertions -# assert len(result) >= 1, "Should find at least one document" -# assert any(d.id == doc.id for d in result), f"Should find document {doc.id}" -# assert result[0].metadata == {"test": "test"}, "Metadata should match" +@test("query: get doc") +async def _(dsn=pg_dsn, developer=test_developer, doc=test_doc): + pool = await create_db_pool(dsn=dsn) + doc_test = await get_doc( + developer_id=developer.id, + doc_id=doc.id, + connection_pool=pool, + ) + assert doc_test.id == doc.id + assert doc_test.title is not None + assert doc_test.content is not None +@test("query: list user docs") +async def _(dsn=pg_dsn, developer=test_developer, user=test_user): + pool = await create_db_pool(dsn=dsn) + + # Create a doc owned by the user + doc_user = await create_doc( + developer_id=developer.id, + data=CreateDocRequest( + title="User List Test", + content="Some user doc content", + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + owner_type="user", + owner_id=user.id, + connection_pool=pool, + ) + + # List user's docs + docs_list = await list_docs( + developer_id=developer.id, + owner_type="user", + owner_id=user.id, + connection_pool=pool, + ) + assert len(docs_list) >= 1 + assert any(d.id == doc_user.id for d in docs_list) + + +@test("query: list agent docs") +async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): + pool = await create_db_pool(dsn=dsn) + + # Create a doc owned by the agent + doc_agent = await create_doc( + developer_id=developer.id, + data=CreateDocRequest( + title="Agent List Test", + content="Some agent doc content", + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + + # List agent's docs + docs_list = await list_docs( + developer_id=developer.id, + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + assert len(docs_list) >= 1 + assert any(d.id == doc_agent.id for d in docs_list) + + +@test("query: delete user doc") +async def _(dsn=pg_dsn, developer=test_developer, user=test_user): + pool = await create_db_pool(dsn=dsn) + + # Create a doc owned by the user + doc_user = await create_doc( + developer_id=developer.id, + data=CreateDocRequest( + title="User Delete Test", + content="Doc for user deletion test", + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + owner_type="user", + owner_id=user.id, + connection_pool=pool, + ) + + # Delete the doc + await delete_doc( + developer_id=developer.id, + doc_id=doc_user.id, + 
owner_type="user", + owner_id=user.id, + connection_pool=pool, + ) + + # Verify doc is no longer in user's docs + docs_list = await list_docs( + developer_id=developer.id, + owner_type="user", + owner_id=user.id, + connection_pool=pool, + ) + assert not any(d.id == doc_user.id for d in docs_list) + + +@test("query: delete agent doc") +async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): + pool = await create_db_pool(dsn=dsn) + + # Create a doc owned by the agent + doc_agent = await create_doc( + developer_id=developer.id, + data=CreateDocRequest( + title="Agent Delete Test", + content="Doc for agent deletion test", + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + + # Delete the doc + await delete_doc( + developer_id=developer.id, + doc_id=doc_agent.id, + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + + # Verify doc is no longer in agent's docs + docs_list = await list_docs( + developer_id=developer.id, + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + assert not any(d.id == doc_agent.id for d in docs_list) + + +@test("query: search docs by text") +async def _(dsn=pg_dsn, agent=test_agent, developer=test_developer): + pool = await create_db_pool(dsn=dsn) + + # Create a test document + doc = await create_doc( + developer_id=developer.id, + owner_type="agent", + owner_id=agent.id, + data=CreateDocRequest( + title="Hello", + content="The world is a funny little thing", + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + connection_pool=pool, + ) + + # Search using simpler terms first + result = await search_docs_by_text( + developer_id=developer.id, + owners=[("agent", agent.id)], + query="world", + k=3, + search_language="english", + metadata_filter={"test": "test"}, + connection_pool=pool, + ) + + print("\nSearch results:", result) + + # More specific assertions + assert len(result) >= 1, "Should find at least one document" + assert any(d.id == doc.id for d in result), f"Should find document {doc.id}" + assert result[0].metadata == {"test": "test"}, "Metadata should match" + +@test("query: search docs by text with technical terms and phrases") +async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): + pool = await create_db_pool(dsn=dsn) + + # Create documents with technical content + doc1 = await create_doc( + developer_id=developer.id, + owner_type="agent", + owner_id=agent.id, + data=CreateDocRequest( + title="Technical Document", + content="API endpoints using REST architecture with JSON payloads", + metadata={"domain": "technical"}, + embed_instruction="Embed the document", + ), + connection_pool=pool, + ) + + doc2 = await create_doc( + developer_id=developer.id, + owner_type="agent", + owner_id=agent.id, + data=CreateDocRequest( + title="More Technical Terms", + content="Database optimization using indexing and query planning", + metadata={"domain": "technical"}, + embed_instruction="Embed the document", + ), + connection_pool=pool, + ) + + # Test with technical terms + technical_queries = [ + "API endpoints", + "REST architecture", + "database optimization", + "indexing" + ] + + for query in technical_queries: + results = await search_docs_by_text( + developer_id=developer.id, + owners=[("agent", agent.id)], + query=query, + k=3, + search_language="english", + connection_pool=pool, + ) + + print(f"\nSearch results for '{query}':", results) + + # Verify appropriate document is found 
based on query + if "API" in query or "REST" in query: + assert any(doc.id == doc1.id for doc in results), f"Doc1 should be found with query '{query}'" + if "database" in query.lower() or "indexing" in query: + assert any(doc.id == doc2.id for doc in results), f"Doc2 should be found with query '{query}'" + @test("query: search docs by embedding") async def _( dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding @@ -273,318 +386,149 @@ async def _( assert result[0].metadata is not None -# @test("query: search docs by hybrid") -# async def _( -# dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding -# ): -# pool = await create_db_pool(dsn=dsn) - -# # Get query embedding by averaging the embeddings (list of floats) -# query_embedding = [sum(k) / len(k) for k in zip(*doc.embeddings)] - -# # Search using the correct parameter types -# result = await search_docs_hybrid( -# developer_id=developer.id, -# owners=[("agent", agent.id)], -# text_query=doc.content[0] if isinstance(doc.content, list) else doc.content, -# embedding=query_embedding, -# k=3, # Add k parameter -# metadata_filter={"test": "test"}, # Add metadata filter -# connection_pool=pool, -# ) - -# assert len(result) >= 1 -# assert result[0].metadata is not None +@test("query: search docs by hybrid") +async def _( + dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding +): + pool = await create_db_pool(dsn=dsn) + # Get query embedding by averaging the embeddings (list of floats) + query_embedding = [sum(k) / len(k) for k in zip(*doc.embeddings)] -# @test("query: test tsvector with technical terms and phrases") -# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): -# pool = await create_db_pool(dsn=dsn) + # Search using the correct parameter types + result = await search_docs_hybrid( + developer_id=developer.id, + owners=[("agent", agent.id)], + text_query=doc.content[0] if isinstance(doc.content, list) else doc.content, + embedding=query_embedding, + k=3, # Add k parameter + metadata_filter={"test": "test"}, # Add metadata filter + connection_pool=pool, + ) -# # Create documents with technical content -# doc1 = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="Technical Document", -# content="API endpoints using REST architecture with JSON payloads", -# metadata={"domain": "technical"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# doc2 = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="More Technical Terms", -# content="Database optimization using indexing and query planning", -# metadata={"domain": "technical"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# # Test with technical terms -# technical_queries = [ -# "API endpoints", -# "REST architecture", -# "database optimization", -# "indexing" -# ] + assert len(result) >= 1 + assert result[0].metadata is not None -# for query in technical_queries: -# results = await search_docs_by_text( -# developer_id=developer.id, -# owners=[("agent", agent.id)], -# query=query, -# k=3, -# search_language="english", -# connection_pool=pool, -# ) +@test("utility: test text_to_tsvector_query") +async def _(): + test_cases = [ + # Single words + ( + "test", + "test" + ), + + # Multiple words in single sentence + ( + "quick brown fox", + "quick brown fox" # Now kept 
as a single phrase due to proximity + ), + + # Technical terms and phrases + ( + "Machine Learning algorithm", + "machine learning algorithm" # Common technical phrase + ), + # Multiple sentences + ( + "Machine learning is great. Data science rocks.", + "machine learning OR data science rocks" + ), + + # Quoted phrases + ( + '"quick brown fox"', + "quick brown fox" # Quotes removed, phrase kept together + ), + ( + 'Find "machine learning" algorithms', + "machine learning" + ), + + # Multiple quoted phrases + ( + '"data science" and "machine learning"', + "machine learning OR data science" + ), + + # Edge cases + ( + "", + "" + ), + ( + "the and or", + "" # All stop words should result in empty string + ), + ( + "a", + "" # Single stop word should result in empty string + ), + ( + "X", + "X" + ), + + # Empty quotes + ( + '""', + "" + ), + ( + 'test "" phrase', + "phrase OR test" + ), + ] -# print(f"\nSearch results for '{query}':", results) + for input_text, expected_output in test_cases: + print(f"Input: '{input_text}'") + result = text_to_tsvector_query(input_text) + print(f"Generated query: '{result}'") + print(f"Expected: '{expected_output}'\n") + assert result.lower() == expected_output.lower(), \ + f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" -# # Verify appropriate document is found based on query -# if "API" in query or "REST" in query: -# assert any(doc.id == doc1.id for doc in results), f"Doc1 should be found with query '{query}'" -# if "database" in query.lower() or "indexing" in query: -# assert any(doc.id == doc2.id for doc in results), f"Doc2 should be found with query '{query}'" -# @test("query: test tsvector with varying content lengths and special characters") -# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): +# @test("query: search docs by embedding with different confidence levels") +# async def _( +# dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding +# ): # pool = await create_db_pool(dsn=dsn) -# # Create documents with different content lengths -# short_doc = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="Short", -# content="Brief test document", -# metadata={"length": "short"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# medium_doc = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="Medium", -# content="This is a medium length document that contains more words and context for testing purposes", -# metadata={"length": "medium"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# long_doc = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="Long", -# content="This is a much longer document that contains multiple sentences. It includes various terms and phrases. \ -# The purpose is to test how the search handles longer content with more context. \ -# It should still be able to find relevant matches based on the search query.", -# metadata={"length": "long"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# special_doc = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="Special Characters", -# content="Testing! With? Different... 
punctuation; marks: and-hyphens, plus+signs & ampersands", -# metadata={"type": "special"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# # Test cases for different content lengths -# length_test_cases = [ -# ("brief test", short_doc.id), -# ("medium length document", medium_doc.id), -# ("multiple sentences", long_doc.id), -# ("document", None) # Should find all documents +# # Get query embedding (using original doc's embedding) +# query_embedding = make_vector_with_similarity(EMBEDDING_SIZE, 0.7) + +# # Test with different confidence levels +# confidence_tests = [ +# (0.99, 0), # Very high similarity threshold - should find no results +# (0.7, 1), # High similarity - should find 1 result (the embedding with all 1.0s) +# (0.3, 2), # Medium similarity - should find 2 results (including 0.3-0.7 embedding) +# (-0.8, 3), # Low similarity - should find 3 results (including -0.8 to 0.8 embedding) +# (-1.0, 4) # Lowest similarity - should find all 4 results (including alternating -1/1) # ] -# for query, expected_doc_id in length_test_cases: -# results = await search_docs_by_text( +# for confidence, expected_min_results in confidence_tests: +# results = await search_docs_by_embedding( # developer_id=developer.id, # owners=[("agent", agent.id)], -# query=query, +# embedding=query_embedding, # k=3, -# search_language="english", +# confidence=confidence, +# metadata_filter={"test": "test"}, # connection_pool=pool, # ) -# print(f"\nSearch results for '{query}':", results) - -# if expected_doc_id: -# assert any(doc.id == expected_doc_id for doc in results), \ -# f"Expected document should be found with query '{query}'" -# else: -# # For general terms, verify multiple documents are found -# assert len(results) > 1, f"Multiple documents should be found with query '{query}'" - -# @test("query: test direct tsvector generation") -# async def _(): -# test_cases = [ -# # Single words -# ( -# "test", -# "'test'" -# ), -# ( -# "testing", -# "'testing'" -# ), - -# # Multiple words in single sentence -# ( -# "quick brown fox", -# "'quick' & 'brown' & 'fox'" -# ), -# ( -# "The Quick Brown Fox", -# "'quick' & 'brown' & 'fox'" -# ), - -# # Technical terms and phrases -# ( -# "machine learning algorithm", -# "('machine' <-> 'learning') & 'algorithm'" -# ), -# ( -# "REST API implementation", -# "'rest' & 'api' & 'implementation'" -# ), - -# # Multiple sentences -# ( -# "Machine learning is great. 
Data science rocks.", -# "('machine' <-> 'learning') & 'great' | ('data' <-> 'science') & 'rocks'" -# ), - -# # Quoted phrases -# ( -# '"quick brown fox"', -# "('quick' <-> 'brown' <-> 'fox')" -# ), -# ( -# 'Find "machine learning" algorithms', -# "('machine' <-> 'learning') & 'algorithms' & 'find'" -# ), - -# # Multiple quoted phrases -# ( -# '"data science" and "machine learning"', -# "('data' <-> 'science') & ('machine' <-> 'learning')" -# ), - -# # Edge cases -# ( -# "", -# "" -# ), -# ( -# "the and or", -# "" -# ), -# ( -# "a", -# "" -# ), -# ( -# "X", -# "'x'" -# ), - -# # Empty quotes -# ( -# '""', -# "" -# ), -# ( -# 'test "" phrase', -# "'test' & 'phrase'" -# ), -# ] - -# for input_text, expected_output in test_cases: -# result = text_to_tsvector_query(input_text) -# print(f"\nInput: '{input_text}'") -# print(f"Generated tsquery: '{result}'") -# print(f"Expected: '{expected_output}'") -# assert result == expected_output, \ -# f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" - +# print(f"\nSearch results with confidence {confidence}:") +# for r in results: +# print(f"- Doc ID: {r.id}, Distance: {r.distance}") -@test("query: search docs by embedding with different confidence levels") -async def _( - dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding -): - pool = await create_db_pool(dsn=dsn) - - # Create a test document with a different embedding - # different_embedding = [0.5] * EMBEDDING_SIZE # Create different embedding values - # await pool.execute( - # """ - # INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) - # VALUES ($1, $2, 0, 1, $3, $4) - # """, # Changed chunk_seq from 0 to 1 - # developer.id, - # doc.id, - # "Different test content", - # f"[{', '.join([str(x) for x in different_embedding])}]", - # ) - - # Get query embedding (using original doc's embedding) - query_embedding = [sum(k) / len(k) for k in zip(*doc.embeddings)] - - # Test with different confidence levels - confidence_tests = [ - (0.99, 0), # High confidence should find no results - (0.7, 1), # Medium confidence should find some results - (0.5, 2), # Lower confidence should find more results - (0.1, 2), # Very low confidence should find all results - ] - - for confidence, expected_min_results in confidence_tests: - results = await search_docs_by_embedding( - developer_id=developer.id, - owners=[("agent", agent.id)], - embedding=query_embedding, - k=3, - confidence=confidence, - metadata_filter={"test": "test"}, - connection_pool=pool, - ) - - print(f"\nSearch results with confidence {confidence}:") - for r in results: - print(f"- Doc ID: {r.id}, Distance: {r.distance}") - - assert len(results) >= expected_min_results, ( - f"Expected at least {expected_min_results} results with confidence {confidence}, got {len(results)}" - ) +# assert len(results) >= expected_min_results, ( +# f"Expected at least {expected_min_results} results with confidence {confidence}, got {len(results)}" +# ) - if results: - # Verify that all returned results meet the confidence threshold - for result in results: - assert result.distance >= confidence, ( - f"Result distance {result.distance} is below confidence threshold {confidence}" - ) +# if results: +# # Verify that all returned results meet the confidence threshold +# for result in results: +# assert result.distance >= confidence, ( +# f"Result distance {result.distance} is below confidence threshold {confidence}" +# ) From 41ae093075e3279b78adc2fa03fe645f8c8e8077 Mon Sep 17 
00:00:00 2001 From: vedantsahai18 Date: Mon, 13 Jan 2025 17:53:29 -0500 Subject: [PATCH 10/27] chore: misc code refactor --- .../queries/docs/search_docs_by_text.py | 3 ++ agents-api/tests/fixtures.py | 31 +++++++++++++++++-- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/agents-api/agents_api/queries/docs/search_docs_by_text.py b/agents-api/agents_api/queries/docs/search_docs_by_text.py index 77fb3a0e6..44e1bb731 100644 --- a/agents-api/agents_api/queries/docs/search_docs_by_text.py +++ b/agents-api/agents_api/queries/docs/search_docs_by_text.py @@ -7,6 +7,7 @@ from ...autogen.openapi_model import DocReference from ...common.utils.db_exceptions import common_db_exceptions from ..utils import pg_query, rewrap_exceptions, wrap_in_class +from ...common.nlp import text_to_tsvector_query from .utils import transform_to_doc_reference # Raw query for text search @@ -60,6 +61,8 @@ async def search_docs_by_text( # Extract owner types and IDs owner_types: list[str] = [owner[0] for owner in owners] owner_ids: list[str] = [str(owner[1]) for owner in owners] + # Pre-process rawtext query + # query = text_to_tsvector_query(query) return ( search_docs_text_query, diff --git a/agents-api/tests/fixtures.py b/agents-api/tests/fixtures.py index 166bbef73..43eb47b9a 100644 --- a/agents-api/tests/fixtures.py +++ b/agents-api/tests/fixtures.py @@ -175,15 +175,40 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test f"[{', '.join([str(x) for x in [1.0] * 1024])}]", ) + # Insert embedding with random values between 0.3 and 0.7 await pool.execute( """ INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) VALUES ($1, $2, 0, 1, $3, $4) - """, # Changed chunk_seq from 0 to 1 + """, developer.id, doc.id, - "Different test content", - f"[{', '.join([str(x) for x in [0.5] * 1024])}]", + "Test content 1", + f"[{', '.join([str(0.3 + 0.4 * (i % 3) / 2) for i in range(1024)])}]", + ) + + # Insert embedding with random values between -0.8 and 0.8 + await pool.execute( + """ + INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) + VALUES ($1, $2, 0, 2, $3, $4) + """, + developer.id, + doc.id, + "Test content 2", + f"[{', '.join([str(-0.8 + 1.6 * (i % 5) / 4) for i in range(1024)])}]", + ) + + # Insert embedding with alternating -1 and 1 + await pool.execute( + """ + INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) + VALUES ($1, $2, 0, 3, $3, $4) + """, + developer.id, + doc.id, + "Test content 3", + f"[{', '.join([str(-1 if i % 2 else 1) for i in range(1024)])}]", ) yield await get_doc(developer_id=developer.id, doc_id=doc.id, connection_pool=pool) From 25a2e6560b1aba85431cad5cc5302a627e173c8c Mon Sep 17 00:00:00 2001 From: Vedantsahai18 Date: Mon, 13 Jan 2025 22:55:10 +0000 Subject: [PATCH 11/27] refactor: Lint agents-api (CI) --- agents-api/agents_api/common/nlp.py | 15 +-- .../queries/docs/search_docs_by_text.py | 1 - agents-api/tests/fixtures.py | 4 +- agents-api/tests/test_docs_queries.py | 96 ++++++++----------- 4 files changed, 49 insertions(+), 67 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 233517aa8..be86d8936 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -96,7 +96,8 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str] ent_spans = [ent for ent in doc.ents if ent.label_ not in excluded_labels] # Add more 
comprehensive stopword filtering for noun chunks chunk_spans = [ - chunk for chunk in doc.noun_chunks + chunk + for chunk in doc.noun_chunks if not chunk.root.is_stop and not all(token.is_stop for token in chunk) ] all_spans = filter_spans(ent_spans + chunk_spans) @@ -109,7 +110,7 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str] # Skip if all tokens in span are stopwords if all(token.is_stop for token in span): continue - + text = span.text.strip() lower_text = text.lower() @@ -194,7 +195,7 @@ def text_to_tsvector_query( ) -> str: """ Extracts meaningful keywords/phrases from text and joins them with OR. - + Example: Input: "I like basketball especially Michael Jordan" Output: "basketball OR Michael Jordan" @@ -216,7 +217,7 @@ def text_to_tsvector_query( for sent in doc.sents: sent_doc = sent.as_doc() - + # Extract keywords keywords = extract_keywords(sent_doc, top_n) if len(keywords) < min_keywords: @@ -235,7 +236,7 @@ def text_to_tsvector_query( if len(group) > 1: # Sort by length descending to prioritize longer phrases sorted_group = sorted(group, key=len, reverse=True) - # For truly proximate multi-word groups, group words + # For truly proximate multi-word groups, group words queries.add(" OR ".join(sorted_group)) else: # For non-proximate words or single words, add them separately @@ -265,7 +266,7 @@ def batch_text_to_tsvector_queries( results = [] for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): - queries = set() # Use set to avoid duplicates + queries = set() # Use set to avoid duplicates for sent in doc.sents: sent_doc = sent.as_doc() keywords = extract_keywords(sent_doc, top_n) @@ -280,7 +281,7 @@ def batch_text_to_tsvector_queries( if len(group) > 1: # Sort by length descending to prioritize longer phrases sorted_group = sorted(group, key=len, reverse=True) - # For truly proximate multi-word groups, group words + # For truly proximate multi-word groups, group words queries.add(" OR ".join(sorted_group)) else: # For non-proximate words or single words, add them separately diff --git a/agents-api/agents_api/queries/docs/search_docs_by_text.py b/agents-api/agents_api/queries/docs/search_docs_by_text.py index 44e1bb731..b1758625b 100644 --- a/agents-api/agents_api/queries/docs/search_docs_by_text.py +++ b/agents-api/agents_api/queries/docs/search_docs_by_text.py @@ -7,7 +7,6 @@ from ...autogen.openapi_model import DocReference from ...common.utils.db_exceptions import common_db_exceptions from ..utils import pg_query, rewrap_exceptions, wrap_in_class -from ...common.nlp import text_to_tsvector_query from .utils import transform_to_doc_reference # Raw query for text search diff --git a/agents-api/tests/fixtures.py b/agents-api/tests/fixtures.py index 43eb47b9a..a5dc7dc32 100644 --- a/agents-api/tests/fixtures.py +++ b/agents-api/tests/fixtures.py @@ -187,7 +187,7 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test f"[{', '.join([str(0.3 + 0.4 * (i % 3) / 2) for i in range(1024)])}]", ) - # Insert embedding with random values between -0.8 and 0.8 + # Insert embedding with random values between -0.8 and 0.8 await pool.execute( """ INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) @@ -195,7 +195,7 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test """, developer.id, doc.id, - "Test content 2", + "Test content 2", f"[{', '.join([str(-0.8 + 1.6 * (i % 5) / 4) for i in range(1024)])}]", ) diff --git 
a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index fd82d7396..54e182bd9 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -1,5 +1,6 @@ from agents_api.autogen.openapi_model import CreateDocRequest from agents_api.clients.pg import create_db_pool +from agents_api.common.nlp import text_to_tsvector_query from agents_api.queries.docs.create_doc import create_doc from agents_api.queries.docs.delete_doc import delete_doc from agents_api.queries.docs.get_doc import get_doc @@ -9,8 +10,6 @@ from agents_api.queries.docs.search_docs_hybrid import search_docs_hybrid from ward import test -from agents_api.common.nlp import text_to_tsvector_query - from .fixtures import ( pg_dsn, test_agent, @@ -24,34 +23,36 @@ import math + def make_vector_with_similarity(n: int, d: float): """ Returns a list `v` of length `n` such that the cosine similarity between `v` and the all-ones vector of length `n` is approximately d. """ if not -1.0 <= d <= 1.0: - raise ValueError("d must lie in [-1, 1].") - + msg = "d must lie in [-1, 1]." + raise ValueError(msg) + # Handle special cases exactly: if abs(d - 1.0) < 1e-12: # d ~ +1 return [1.0] * n if abs(d + 1.0) < 1e-12: # d ~ -1 return [-1.0] * n - if abs(d) < 1e-12: # d ~ 0 - v = [0.0]*n + if abs(d) < 1e-12: # d ~ 0 + v = [0.0] * n if n >= 2: v[0] = 1.0 v[1] = -1.0 return v sign_d = 1.0 if d >= 0 else -1.0 - + # Base part: sign(d)*[1,1,...,1] - base = [sign_d]*n - + base = [sign_d] * n + # Orthogonal unit vector u with sum(u)=0; for simplicity: # u = [1/sqrt(2), -1/sqrt(2), 0, 0, ..., 0] - u = [0.0]*n + u = [0.0] * n if n >= 2: u[0] = 1.0 / math.sqrt(2) u[1] = -1.0 / math.sqrt(2) @@ -59,13 +60,13 @@ def make_vector_with_similarity(n: int, d: float): # Solve for alpha: # alpha^2 = n*(1 - d^2)/d^2 - alpha = math.sqrt(n*(1 - d*d)) / abs(d) + alpha = math.sqrt(n * (1 - d * d)) / abs(d) # Construct v - v = [0.0]*n + v = [0.0] * n for i in range(n): v[i] = base[i] + alpha * u[i] - + return v @@ -304,6 +305,7 @@ async def _(dsn=pg_dsn, agent=test_agent, developer=test_developer): assert any(d.id == doc.id for d in result), f"Should find document {doc.id}" assert result[0].metadata == {"test": "test"}, "Metadata should match" + @test("query: search docs by text with technical terms and phrases") async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): pool = await create_db_pool(dsn=dsn) @@ -340,7 +342,7 @@ async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): "API endpoints", "REST architecture", "database optimization", - "indexing" + "indexing", ] for query in technical_queries: @@ -357,9 +359,14 @@ async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): # Verify appropriate document is found based on query if "API" in query or "REST" in query: - assert any(doc.id == doc1.id for doc in results), f"Doc1 should be found with query '{query}'" + assert any(doc.id == doc1.id for doc in results), ( + f"Doc1 should be found with query '{query}'" + ) if "database" in query.lower() or "indexing" in query: - assert any(doc.id == doc2.id for doc in results), f"Doc2 should be found with query '{query}'" + assert any(doc.id == doc2.id for doc in results), ( + f"Doc2 should be found with query '{query}'" + ) + @test("query: search docs by embedding") async def _( @@ -409,75 +416,49 @@ async def _( assert len(result) >= 1 assert result[0].metadata is not None + @test("utility: test text_to_tsvector_query") async def _(): test_cases = [ # Single words - ( - "test", - 
"test" - ), - + ("test", "test"), # Multiple words in single sentence ( "quick brown fox", - "quick brown fox" # Now kept as a single phrase due to proximity + "quick brown fox", # Now kept as a single phrase due to proximity ), - # Technical terms and phrases ( "Machine Learning algorithm", - "machine learning algorithm" # Common technical phrase + "machine learning algorithm", # Common technical phrase ), # Multiple sentences ( "Machine learning is great. Data science rocks.", - "machine learning OR data science rocks" + "machine learning OR data science rocks", ), - # Quoted phrases ( '"quick brown fox"', - "quick brown fox" # Quotes removed, phrase kept together + "quick brown fox", # Quotes removed, phrase kept together ), - ( - 'Find "machine learning" algorithms', - "machine learning" - ), - + ('Find "machine learning" algorithms', "machine learning"), # Multiple quoted phrases - ( - '"data science" and "machine learning"', - "machine learning OR data science" - ), - + ('"data science" and "machine learning"', "machine learning OR data science"), # Edge cases - ( - "", - "" - ), + ("", ""), ( "the and or", - "" # All stop words should result in empty string + "", # All stop words should result in empty string ), ( "a", - "" # Single stop word should result in empty string + "", # Single stop word should result in empty string ), - ( - "X", - "X" - ), - + ("X", "X"), # Empty quotes - ( - '""', - "" - ), - ( - 'test "" phrase', - "phrase OR test" - ), + ('""', ""), + ('test "" phrase', "phrase OR test"), ] for input_text, expected_output in test_cases: @@ -485,8 +466,9 @@ async def _(): result = text_to_tsvector_query(input_text) print(f"Generated query: '{result}'") print(f"Expected: '{expected_output}'\n") - assert result.lower() == expected_output.lower(), \ + assert result.lower() == expected_output.lower(), ( f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" + ) # @test("query: search docs by embedding with different confidence levels") From fd2481e4812996bc312943a799198147a45c0219 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Mon, 13 Jan 2025 18:03:26 -0500 Subject: [PATCH 12/27] chore: misc fix --- agents-api/tests/test_docs_queries.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index 54e182bd9..3abc91f2f 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -417,7 +417,7 @@ async def _( assert result[0].metadata is not None -@test("utility: test text_to_tsvector_query") +@test("utility: test for text_to_tsvector_query") async def _(): test_cases = [ # Single words @@ -434,8 +434,8 @@ async def _(): ), # Multiple sentences ( - "Machine learning is great. Data science rocks.", - "machine learning OR data science rocks", + "I love basketball especially Michael Jordan. 
LeBron James is also great.", + ["basketball OR lebron james OR michael jordan", "LeBron James OR Michael Jordan OR basketball"], ), # Quoted phrases ( @@ -466,9 +466,14 @@ async def _(): result = text_to_tsvector_query(input_text) print(f"Generated query: '{result}'") print(f"Expected: '{expected_output}'\n") - assert result.lower() == expected_output.lower(), ( - f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" - ) + if isinstance(expected_output, list): + assert any(result.lower() == expected_output.lower() for expected_output in expected_output), ( + f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" + ) + else: + assert result.lower() == expected_output.lower(), ( + f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" + ) # @test("query: search docs by embedding with different confidence levels") From 1a7eca2f872f96813cfb684cd1eea77c152c3af0 Mon Sep 17 00:00:00 2001 From: Vedantsahai18 Date: Mon, 13 Jan 2025 23:04:21 +0000 Subject: [PATCH 13/27] refactor: Lint agents-api (CI) --- agents-api/tests/test_docs_queries.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index 3abc91f2f..5326c7c4b 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -435,7 +435,10 @@ async def _(): # Multiple sentences ( "I love basketball especially Michael Jordan. LeBron James is also great.", - ["basketball OR lebron james OR michael jordan", "LeBron James OR Michael Jordan OR basketball"], + [ + "basketball OR lebron james OR michael jordan", + "LeBron James OR Michael Jordan OR basketball", + ], ), # Quoted phrases ( @@ -467,9 +470,9 @@ async def _(): print(f"Generated query: '{result}'") print(f"Expected: '{expected_output}'\n") if isinstance(expected_output, list): - assert any(result.lower() == expected_output.lower() for expected_output in expected_output), ( - f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" - ) + assert any( + result.lower() == expected_output.lower() for expected_output in expected_output + ), f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" else: assert result.lower() == expected_output.lower(), ( f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" From 7c9962813555b78ea6060dd699c0830d61fdb9a9 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Mon, 13 Jan 2025 18:31:29 -0500 Subject: [PATCH 14/27] chore: test fix + add embedding vector generation based on the confidence --- agents-api/tests/fixtures.py | 29 +++++++++++---- agents-api/tests/test_docs_queries.py | 51 ++------------------------- agents-api/tests/utils.py | 45 +++++++++++++++++++++++ 3 files changed, 71 insertions(+), 54 deletions(-) diff --git a/agents-api/tests/fixtures.py b/agents-api/tests/fixtures.py index a5dc7dc32..919403c1f 100644 --- a/agents-api/tests/fixtures.py +++ b/agents-api/tests/fixtures.py @@ -45,6 +45,7 @@ ) from .utils import ( patch_embed_acompletion as patch_embed_acompletion_ctx, + make_vector_with_similarity, ) @@ -164,6 +165,10 @@ async def test_doc(dsn=pg_dsn, developer=test_developer, agent=test_agent): @fixture(scope="test") async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test_doc): pool = await create_db_pool(dsn=dsn) + embedding_with_confidence_0 = make_vector_with_similarity(d=0.0) + embedding_with_confidence_05 = make_vector_with_similarity(d=0.5) + 
embedding_with_confidence_05_neg = make_vector_with_similarity(d=-0.5) + embedding_with_confidence_1_neg = make_vector_with_similarity(d=-1.0) await pool.execute( """ INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) @@ -175,7 +180,7 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test f"[{', '.join([str(x) for x in [1.0] * 1024])}]", ) - # Insert embedding with random values between 0.3 and 0.7 + # Insert embedding with confidence 0 with respect to unit vector await pool.execute( """ INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) @@ -184,10 +189,10 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test developer.id, doc.id, "Test content 1", - f"[{', '.join([str(0.3 + 0.4 * (i % 3) / 2) for i in range(1024)])}]", + f"[{', '.join([str(x) for x in embedding_with_confidence_0])}]", ) - # Insert embedding with random values between -0.8 and 0.8 + # Insert embedding with confidence 0.5 with respect to unit vector await pool.execute( """ INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) @@ -196,10 +201,10 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test developer.id, doc.id, "Test content 2", - f"[{', '.join([str(-0.8 + 1.6 * (i % 5) / 4) for i in range(1024)])}]", + f"[{', '.join([str(x) for x in embedding_with_confidence_05])}]", ) - # Insert embedding with alternating -1 and 1 + # Insert embedding with confidence -0.5 with respect to unit vector await pool.execute( """ INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) @@ -208,7 +213,19 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test developer.id, doc.id, "Test content 3", - f"[{', '.join([str(-1 if i % 2 else 1) for i in range(1024)])}]", + f"[{', '.join([str(x) for x in embedding_with_confidence_05_neg])}]", + ) + + # Insert embedding with confidence -1 with respect to unit vector + await pool.execute( + """ + INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) + VALUES ($1, $2, 0, 4, $3, $4) + """, + developer.id, + doc.id, + "Test content 4", + f"[{', '.join([str(x) for x in embedding_with_confidence_1_neg])}]", ) yield await get_doc(developer_id=developer.id, doc_id=doc.id, connection_pool=pool) diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index 5326c7c4b..fd0246e1c 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -19,55 +19,9 @@ test_user, ) -EMBEDDING_SIZE: int = 1024 - -import math - - -def make_vector_with_similarity(n: int, d: float): - """ - Returns a list `v` of length `n` such that the cosine similarity - between `v` and the all-ones vector of length `n` is approximately d. - """ - if not -1.0 <= d <= 1.0: - msg = "d must lie in [-1, 1]." 
- raise ValueError(msg) - - # Handle special cases exactly: - if abs(d - 1.0) < 1e-12: # d ~ +1 - return [1.0] * n - if abs(d + 1.0) < 1e-12: # d ~ -1 - return [-1.0] * n - if abs(d) < 1e-12: # d ~ 0 - v = [0.0] * n - if n >= 2: - v[0] = 1.0 - v[1] = -1.0 - return v - - sign_d = 1.0 if d >= 0 else -1.0 +from .utils import make_vector_with_similarity - # Base part: sign(d)*[1,1,...,1] - base = [sign_d] * n - - # Orthogonal unit vector u with sum(u)=0; for simplicity: - # u = [1/sqrt(2), -1/sqrt(2), 0, 0, ..., 0] - u = [0.0] * n - if n >= 2: - u[0] = 1.0 / math.sqrt(2) - u[1] = -1.0 / math.sqrt(2) - # (if n=1, there's no truly orthogonal vector to [1], so skip) - - # Solve for alpha: - # alpha^2 = n*(1 - d^2)/d^2 - alpha = math.sqrt(n * (1 - d * d)) / abs(d) - - # Construct v - v = [0.0] * n - for i in range(n): - v[i] = base[i] + alpha * u[i] - - return v +EMBEDDING_SIZE: int = 1024 @test("query: create user doc") @@ -438,6 +392,7 @@ async def _(): [ "basketball OR lebron james OR michael jordan", "LeBron James OR Michael Jordan OR basketball", + "Michael Jordan OR basketball OR LeBron James" ], ), # Quoted phrases diff --git a/agents-api/tests/utils.py b/agents-api/tests/utils.py index 05544e048..95f0194ed 100644 --- a/agents-api/tests/utils.py +++ b/agents-api/tests/utils.py @@ -1,6 +1,7 @@ import asyncio import logging import os +import math import subprocess from contextlib import asynccontextmanager, contextmanager from unittest.mock import patch @@ -18,6 +19,50 @@ # Replicated here to prevent circular import EMBEDDING_SIZE: int = 1024 +def make_vector_with_similarity(n: int = EMBEDDING_SIZE, d: float = 0.5): + """ + Returns a list `v` of length `n` such that the cosine similarity + between `v` and the all-ones vector of length `n` is approximately d. + """ + if not -1.0 <= d <= 1.0: + msg = "d must lie in [-1, 1]." 
+ raise ValueError(msg) + + # Handle special cases exactly: + if abs(d - 1.0) < 1e-12: # d ~ +1 + return [1.0] * n + if abs(d + 1.0) < 1e-12: # d ~ -1 + return [-1.0] * n + if abs(d) < 1e-12: # d ~ 0 + v = [0.0] * n + if n >= 2: + v[0] = 1.0 + v[1] = -1.0 + return v + + sign_d = 1.0 if d >= 0 else -1.0 + + # Base part: sign(d)*[1,1,...,1] + base = [sign_d] * n + + # Orthogonal unit vector u with sum(u)=0; for simplicity: + # u = [1/sqrt(2), -1/sqrt(2), 0, 0, ..., 0] + u = [0.0] * n + if n >= 2: + u[0] = 1.0 / math.sqrt(2) + u[1] = -1.0 / math.sqrt(2) + # (if n=1, there's no truly orthogonal vector to [1], so skip) + + # Solve for alpha: + # alpha^2 = n*(1 - d^2)/d^2 + alpha = math.sqrt(n * (1 - d * d)) / abs(d) + + # Construct v + v = [0.0] * n + for i in range(n): + v[i] = base[i] + alpha * u[i] + + return v @asynccontextmanager async def patch_testing_temporal(): From cb86135727d8c2f64060a296b4955c0a6ecf8ca1 Mon Sep 17 00:00:00 2001 From: Vedantsahai18 Date: Mon, 13 Jan 2025 23:32:20 +0000 Subject: [PATCH 15/27] refactor: Lint agents-api (CI) --- agents-api/tests/fixtures.py | 2 +- agents-api/tests/test_docs_queries.py | 4 +--- agents-api/tests/utils.py | 4 +++- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/agents-api/tests/fixtures.py b/agents-api/tests/fixtures.py index 919403c1f..5b0ff68cc 100644 --- a/agents-api/tests/fixtures.py +++ b/agents-api/tests/fixtures.py @@ -42,10 +42,10 @@ from .utils import ( get_localstack, get_pg_dsn, + make_vector_with_similarity, ) from .utils import ( patch_embed_acompletion as patch_embed_acompletion_ctx, - make_vector_with_similarity, ) diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index fd0246e1c..feec3a6c2 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -19,8 +19,6 @@ test_user, ) -from .utils import make_vector_with_similarity - EMBEDDING_SIZE: int = 1024 @@ -392,7 +390,7 @@ async def _(): [ "basketball OR lebron james OR michael jordan", "LeBron James OR Michael Jordan OR basketball", - "Michael Jordan OR basketball OR LeBron James" + "Michael Jordan OR basketball OR LeBron James", ], ), # Quoted phrases diff --git a/agents-api/tests/utils.py b/agents-api/tests/utils.py index 95f0194ed..45489befd 100644 --- a/agents-api/tests/utils.py +++ b/agents-api/tests/utils.py @@ -1,7 +1,7 @@ import asyncio import logging -import os import math +import os import subprocess from contextlib import asynccontextmanager, contextmanager from unittest.mock import patch @@ -19,6 +19,7 @@ # Replicated here to prevent circular import EMBEDDING_SIZE: int = 1024 + def make_vector_with_similarity(n: int = EMBEDDING_SIZE, d: float = 0.5): """ Returns a list `v` of length `n` such that the cosine similarity @@ -64,6 +65,7 @@ def make_vector_with_similarity(n: int = EMBEDDING_SIZE, d: float = 0.5): return v + @asynccontextmanager async def patch_testing_temporal(): # Set log level to ERROR to avoid spamming the console From 61d32bd1b68add043640e77db54b497f920014f4 Mon Sep 17 00:00:00 2001 From: Ahmad Haidar Date: Tue, 14 Jan 2025 15:27:13 +0300 Subject: [PATCH 16/27] fix(agents-api): Configure spacy for postgresql --- agents-api/agents_api/common/nlp.py | 255 ++++++++------------------ agents-api/tests/test_docs_queries.py | 20 +- 2 files changed, 86 insertions(+), 189 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index be86d8936..62895f7f9 100644 --- a/agents-api/agents_api/common/nlp.py +++ 
b/agents-api/agents_api/common/nlp.py @@ -29,67 +29,33 @@ }, ) - -# Singleton PhraseMatcher for better performance -class KeywordMatcher: - _instance = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super().__new__(cls) - cls._instance.matcher = PhraseMatcher(nlp.vocab, attr="LOWER") - cls._instance.batch_size = 1000 # Adjust based on memory constraints - cls._instance.patterns_cache = {} - return cls._instance - - @lru_cache(maxsize=10000) - def _create_pattern(self, text: str) -> Doc: - return nlp.make_doc(text) - - def find_matches(self, doc: Doc, keywords: list[str]) -> dict[str, list[int]]: - """Batch process keywords for better performance.""" - keyword_positions = defaultdict(list) - - # Process keywords in batches to avoid memory issues - for i in range(0, len(keywords), self.batch_size): - batch = keywords[i : i + self.batch_size] - patterns = [self._create_pattern(kw) for kw in batch] - - # Clear previous patterns and add new batch - if "KEYWORDS" in self.matcher: - self.matcher.remove("KEYWORDS") - self.matcher.add("KEYWORDS", patterns) - - # Find matches for this batch - matches = self.matcher(doc) - for match_id, start, end in matches: - span_text = doc[start:end].text - normalized = WHITESPACE_RE.sub(" ", span_text).lower().strip() - keyword_positions[normalized].append(start) - - return keyword_positions - - -# Initialize global matcher -keyword_matcher = KeywordMatcher() - - @lru_cache(maxsize=10000) def clean_keyword(kw: str) -> str: """Cache cleaned keywords for reuse.""" return NON_ALPHANUM_RE.sub("", kw).strip() -def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str]: +def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { - "DATE", - "TIME", - "PERCENT", - "MONEY", - "QUANTITY", - "ORDINAL", - "CARDINAL", + "DATE", # Absolute or relative dates or periods. + "TIME", # Times smaller than a day. + "PERCENT", # Percentage, including ”%“. + "MONEY", # Monetary values, including unit. + "QUANTITY", # Measurements, as of weight or distance. + "ORDINAL", # “first”, “second”, etc. + "CARDINAL", # Numerals that do not fall under another type. + # "PERSON", # People, including fictional. + # "NORP", # Nationalities or religious or political groups. + # "FAC", # Buildings, airports, highways, bridges, etc. + # "ORG", # Companies, agencies, institutions, etc. + # "GPE", # Countries, cities, states. + # "LOC", # Non-GPE locations, mountain ranges, bodies of water. + # "PRODUCT", # Objects, vehicles, foods, etc. (Not services.) + # "EVENT", # Named hurricanes, battles, wars, sports events, etc. + # "WORK_OF_ART", # Titles of books, songs, etc. + # "LAW", # Named documents made into laws. + # "LANGUAGE", # Any named language. 
} # Extract and filter spans in a single pass @@ -104,8 +70,12 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str] # Process spans efficiently and filter out spans that are entirely stopwords keywords = [] + ent_keywords = [] seen_texts = set() + # Convert ent_spans to set for faster lookup + ent_spans_set = set(ent_spans) + for span in all_spans: # Skip if all tokens in span are stopwords if all(token.is_stop for token in span): @@ -119,79 +89,30 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str] continue seen_texts.add(lower_text) - keywords.append(text) + ent_keywords.append(text) if span in ent_spans_set else keywords.append(text) + # Normalize keywords by replacing multiple spaces with single space and stripping + normalized_ent_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in ent_keywords] normalized_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in keywords] # Count frequencies efficiently + ent_freq = Counter(normalized_ent_keywords) freq = Counter(normalized_keywords) - top_keywords = [kw for kw, _ in freq.most_common(top_n)] + + + top_keywords = [kw for kw, _ in ent_freq.most_common(top_n)] + remaining_slots = max(0, top_n - len(top_keywords)) + top_keywords += [kw for kw, _ in freq.most_common(remaining_slots)] if clean: return [clean_keyword(kw) for kw in top_keywords] return top_keywords -def find_proximity_groups( - keywords: list[str], keyword_positions: dict[str, list[int]], n: int = 10 -) -> list[set[str]]: - """Optimized proximity grouping using sorted positions.""" - # Early return for single or no keywords - if len(keywords) <= 1: - return [{kw} for kw in keywords] - - # Create flat list of positions for efficient processing - positions: list[tuple[int, str]] = [ - (pos, kw) for kw in keywords for pos in keyword_positions[kw] - ] - - # Sort positions once - positions.sort() - - # Initialize Union-Find with path compression and union by rank - parent = {kw: kw for kw in keywords} - rank = dict.fromkeys(keywords, 0) - - def find(u: str) -> str: - if parent[u] != u: - parent[u] = find(parent[u]) - return parent[u] - - def union(u: str, v: str) -> None: - u_root, v_root = find(u), find(v) - if u_root != v_root: - if rank[u_root] < rank[v_root]: - u_root, v_root = v_root, u_root - parent[v_root] = u_root - if rank[u_root] == rank[v_root]: - rank[u_root] += 1 - - # Use sliding window for proximity checking - window = [] - for pos, kw in positions: - # Remove positions outside window - while window and pos - window[0][0] > n: - window.pop(0) - - # Union with all keywords in window - for _, w_kw in window: - union(kw, w_kw) - - window.append((pos, kw)) - - # Group keywords efficiently - groups = defaultdict(set) - for kw in keywords: - root = find(kw) - groups[root].add(kw) - - return list(groups.values()) - - @lru_cache(maxsize=1000) def text_to_tsvector_query( - paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1 + paragraph: str, top_n: int = 25, min_keywords: int = 1 ) -> str: """ Extracts meaningful keywords/phrases from text and joins them with OR. 
@@ -203,7 +124,6 @@ def text_to_tsvector_query( Args: paragraph (str): The input text to process top_n (int): Number of top keywords to extract per sentence - proximity_n (int): The proximity window for grouping related keywords min_keywords (int): Minimum number of keywords required Returns: @@ -223,71 +143,54 @@ def text_to_tsvector_query( if len(keywords) < min_keywords: continue - # Find keyword positions - keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) - if not keyword_positions: - continue - - # Group related keywords by proximity - groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - - # Add each group as a single term to our set - for group in groups: - if len(group) > 1: - # Sort by length descending to prioritize longer phrases - sorted_group = sorted(group, key=len, reverse=True) - # For truly proximate multi-word groups, group words - queries.add(" OR ".join(sorted_group)) - else: - # For non-proximate words or single words, add them separately - queries.update(group) + queries.add(" OR ".join(keywords)) # Join all terms with " OR " return " OR ".join(queries) if queries else "" -def batch_text_to_tsvector_queries( - paragraphs: list[str], - top_n: int = 10, - proximity_n: int = 10, - min_keywords: int = 1, - n_process: int = 1, -) -> list[str]: - """ - Processes multiple paragraphs using nlp.pipe for better performance. - - Args: - paragraphs (list[str]): List of paragraphs to process - top_n (int): Number of top keywords to include per paragraph - - Returns: - list[str]: List of tsquery strings - """ - results = [] - - for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): - queries = set() # Use set to avoid duplicates - for sent in doc.sents: - sent_doc = sent.as_doc() - keywords = extract_keywords(sent_doc, top_n) - if len(keywords) < min_keywords: - continue - keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) - if not keyword_positions: - continue - groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - # Add each group as a single term to our set - for group in groups: - if len(group) > 1: - # Sort by length descending to prioritize longer phrases - sorted_group = sorted(group, key=len, reverse=True) - # For truly proximate multi-word groups, group words - queries.add(" OR ".join(sorted_group)) - else: - # For non-proximate words or single words, add them separately - queries.update(group) - - # Join all terms with " OR " - results.append(" OR ".join(queries) if queries else "") - - return results +# def batch_text_to_tsvector_queries( +# paragraphs: list[str], +# top_n: int = 10, +# proximity_n: int = 10, +# min_keywords: int = 1, +# n_process: int = 1, +# ) -> list[str]: +# """ +# Processes multiple paragraphs using nlp.pipe for better performance. 
+ +# Args: +# paragraphs (list[str]): List of paragraphs to process +# top_n (int): Number of top keywords to include per paragraph + +# Returns: +# list[str]: List of tsquery strings +# """ +# results = [] + +# for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): +# queries = set() # Use set to avoid duplicates +# for sent in doc.sents: +# sent_doc = sent.as_doc() +# keywords = extract_keywords(sent_doc, top_n) +# if len(keywords) < min_keywords: +# continue +# keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) +# if not keyword_positions: +# continue +# groups = find_proximity_groups(keywords, keyword_positions, proximity_n) +# # Add each group as a single term to our set +# for group in groups: +# if len(group) > 1: +# # Sort by length descending to prioritize longer phrases +# sorted_group = sorted(group, key=len, reverse=True) +# # For truly proximate multi-word groups, group words +# queries.add(" OR ".join(sorted_group)) +# else: +# # For non-proximate words or single words, add them separately +# queries.update(group) + +# # Join all terms with " OR " +# results.append(" OR ".join(queries) if queries else "") + +# return results diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index feec3a6c2..fea7f4fbf 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -387,11 +387,7 @@ async def _(): # Multiple sentences ( "I love basketball especially Michael Jordan. LeBron James is also great.", - [ - "basketball OR lebron james OR michael jordan", - "LeBron James OR Michael Jordan OR basketball", - "Michael Jordan OR basketball OR LeBron James", - ], + "basketball OR lebron james OR michael jordan", ), # Quoted phrases ( @@ -422,14 +418,12 @@ async def _(): result = text_to_tsvector_query(input_text) print(f"Generated query: '{result}'") print(f"Expected: '{expected_output}'\n") - if isinstance(expected_output, list): - assert any( - result.lower() == expected_output.lower() for expected_output in expected_output - ), f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" - else: - assert result.lower() == expected_output.lower(), ( - f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" - ) + + result_terms = set(term.lower() for term in result.split(" OR ") if term) + expected_terms = set(term.lower() for term in expected_output.split(" OR ") if term) + assert result_terms == expected_terms, ( + f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" + ) # @test("query: search docs by embedding with different confidence levels") From 890880bb67b776c424624e910ca49c0a045ad45f Mon Sep 17 00:00:00 2001 From: Ahmad-mtos Date: Tue, 14 Jan 2025 12:28:09 +0000 Subject: [PATCH 17/27] refactor: Lint agents-api (CI) --- agents-api/agents_api/common/nlp.py | 24 ++++++++++-------------- agents-api/tests/test_docs_queries.py | 6 +++--- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 62895f7f9..26cba72ce 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -1,9 +1,8 @@ import re -from collections import Counter, defaultdict +from collections import Counter from functools import lru_cache import spacy -from spacy.matcher import PhraseMatcher from spacy.tokens import Doc from spacy.util import filter_spans @@ -29,6 +28,7 @@ }, ) + @lru_cache(maxsize=10000) def 
clean_keyword(kw: str) -> str: """Cache cleaned keywords for reuse.""" @@ -38,13 +38,13 @@ def clean_keyword(kw: str) -> str: def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { - "DATE", # Absolute or relative dates or periods. - "TIME", # Times smaller than a day. - "PERCENT", # Percentage, including ”%“. - "MONEY", # Monetary values, including unit. - "QUANTITY", # Measurements, as of weight or distance. - "ORDINAL", # “first”, “second”, etc. - "CARDINAL", # Numerals that do not fall under another type. + "DATE", # Absolute or relative dates or periods. + "TIME", # Times smaller than a day. + "PERCENT", # Percentage, including ”%“. + "MONEY", # Monetary values, including unit. + "QUANTITY", # Measurements, as of weight or distance. + "ORDINAL", # “first”, “second”, etc. + "CARDINAL", # Numerals that do not fall under another type. # "PERSON", # People, including fictional. # "NORP", # Nationalities or religious or political groups. # "FAC", # Buildings, airports, highways, bridges, etc. @@ -91,7 +91,6 @@ def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str] seen_texts.add(lower_text) ent_keywords.append(text) if span in ent_spans_set else keywords.append(text) - # Normalize keywords by replacing multiple spaces with single space and stripping normalized_ent_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in ent_keywords] normalized_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in keywords] @@ -100,7 +99,6 @@ def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str] ent_freq = Counter(normalized_ent_keywords) freq = Counter(normalized_keywords) - top_keywords = [kw for kw, _ in ent_freq.most_common(top_n)] remaining_slots = max(0, top_n - len(top_keywords)) top_keywords += [kw for kw, _ in freq.most_common(remaining_slots)] @@ -111,9 +109,7 @@ def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str] @lru_cache(maxsize=1000) -def text_to_tsvector_query( - paragraph: str, top_n: int = 25, min_keywords: int = 1 -) -> str: +def text_to_tsvector_query(paragraph: str, top_n: int = 25, min_keywords: int = 1) -> str: """ Extracts meaningful keywords/phrases from text and joins them with OR. 
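With the proximity grouping gone, the cached text_to_tsvector_query now emits a flat "kw1 OR kw2 OR ..." string, which later patches in this series feed into the docs search queries. As a rough sketch of how such a string can be consumed on the PostgreSQL side, assuming an asyncpg connection and a websearch_to_tsquery-based query (the table and column names are placeholders, and the project's actual search_docs_text_query SQL may differ):

    import asyncpg  # assumed: the project talks to Postgres through an asyncpg-style client

    async def run_text_search(dsn: str, tsquery_text: str) -> list[asyncpg.Record]:
        # websearch_to_tsquery ANDs unquoted words, turns the word OR into |,
        # and turns quoted phrases into <-> chains, so an OR-joined keyword
        # string such as "google OR john doe" parses without errors.
        conn = await asyncpg.connect(dsn)
        try:
            return await conn.fetch(
                """
                SELECT doc_id, title
                FROM docs
                WHERE search_tsv @@ websearch_to_tsquery('english', $1)
                ORDER BY ts_rank(search_tsv, websearch_to_tsquery('english', $1)) DESC
                LIMIT 10
                """,
                tsquery_text,
            )
        finally:
            await conn.close()

    # usage (inside an event loop):
    # rows = await run_text_search(dsn, text_to_tsvector_query("John Doe at Google"))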
diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index fea7f4fbf..d4d685c1d 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -418,9 +418,9 @@ async def _(): result = text_to_tsvector_query(input_text) print(f"Generated query: '{result}'") print(f"Expected: '{expected_output}'\n") - - result_terms = set(term.lower() for term in result.split(" OR ") if term) - expected_terms = set(term.lower() for term in expected_output.split(" OR ") if term) + + result_terms = {term.lower() for term in result.split(" OR ") if term} + expected_terms = {term.lower() for term in expected_output.split(" OR ") if term} assert result_terms == expected_terms, ( f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" ) From 27ed1f4a2944f7afb5e5a1ab483693973187297e Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Tue, 14 Jan 2025 17:38:59 -0500 Subject: [PATCH 18/27] chore: misc refactor --- agents-api/agents_api/common/nlp.py | 30 ++++++----------------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index be86d8936..6850be6a9 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -263,31 +263,13 @@ def batch_text_to_tsvector_queries( Returns: list[str]: List of tsquery strings """ - results = [] + # Use a set to avoid duplicates + results = set() for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): - queries = set() # Use set to avoid duplicates - for sent in doc.sents: - sent_doc = sent.as_doc() - keywords = extract_keywords(sent_doc, top_n) - if len(keywords) < min_keywords: - continue - keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) - if not keyword_positions: - continue - groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - # Add each group as a single term to our set - for group in groups: - if len(group) > 1: - # Sort by length descending to prioritize longer phrases - sorted_group = sorted(group, key=len, reverse=True) - # For truly proximate multi-word groups, group words - queries.add(" OR ".join(sorted_group)) - else: - # For non-proximate words or single words, add them separately - queries.update(group) - - # Join all terms with " OR " - results.append(" OR ".join(queries) if queries else "") + # Generate tsquery string for each paragraph + queries = text_to_tsvector_query(doc, top_n, proximity_n, min_keywords) + # Add to results set + results.add(queries) return results From 6a07a54b47b597e922bee597932f6ec2f006790e Mon Sep 17 00:00:00 2001 From: Vedant Sahai Date: Tue, 14 Jan 2025 17:40:03 -0500 Subject: [PATCH 19/27] Update agents-api/agents_api/common/nlp.py Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> --- agents-api/agents_api/common/nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 6850be6a9..e01af83b6 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -272,4 +272,4 @@ def batch_text_to_tsvector_queries( # Add to results set results.add(queries) - return results + return list(results) From 68a7a05b4557e8627d5ab6cdce94f09d085924cc Mon Sep 17 00:00:00 2001 From: Ahmad Haidar Date: Wed, 15 Jan 2025 11:40:32 +0300 Subject: [PATCH 20/27] fix(agents-api): add split chunks option + nlp 
tests --- agents-api/agents_api/common/nlp.py | 23 +++- agents-api/tests/test_docs_queries.py | 57 ---------- agents-api/tests/test_nlp_utilities.py | 147 +++++++++++++++++++++++++ 3 files changed, 165 insertions(+), 62 deletions(-) create mode 100644 agents-api/tests/test_nlp_utilities.py diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 26cba72ce..c49928508 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -9,6 +9,7 @@ # Precompile regex patterns WHITESPACE_RE = re.compile(r"\s+") NON_ALPHANUM_RE = re.compile(r"[^\w\s\-_]+") +LONE_HYPHEN_RE = re.compile(r'\s*-\s*(?!\w)|(? str: """Cache cleaned keywords for reuse.""" - return NON_ALPHANUM_RE.sub("", kw).strip() + # First remove non-alphanumeric chars (except whitespace, hyphens, underscores) + cleaned = NON_ALPHANUM_RE.sub("", kw).strip() + # Replace lone hyphens with spaces + cleaned = LONE_HYPHEN_RE.sub(" ", cleaned) + # Clean up any resulting multiple spaces + cleaned = WHITESPACE_RE.sub(" ", cleaned).strip() + return cleaned -def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str]: +def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True, split_chunks: bool = False) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { "DATE", # Absolute or relative dates or periods. @@ -95,6 +102,9 @@ def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str] normalized_ent_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in ent_keywords] normalized_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in keywords] + if split_chunks: + normalized_keywords = [word for kw in normalized_keywords for word in kw.split()] + # Count frequencies efficiently ent_freq = Counter(normalized_ent_keywords) freq = Counter(normalized_keywords) @@ -109,7 +119,9 @@ def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str] @lru_cache(maxsize=1000) -def text_to_tsvector_query(paragraph: str, top_n: int = 25, min_keywords: int = 1) -> str: +def text_to_tsvector_query( + paragraph: str, top_n: int = 25, min_keywords: int = 1, split_chunks: bool = False +) -> str: """ Extracts meaningful keywords/phrases from text and joins them with OR. 
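This hunk introduces LONE_HYPHEN_RE and reworks clean_keyword into a three-step pipeline: strip punctuation (keeping whitespace, hyphens and underscores), blank out lone hyphens, then collapse the resulting whitespace. The lone-hyphen pattern above is truncated, so the sketch below substitutes one plausible equivalent; treat the regex and the helper name as illustrations rather than the exact code in the patch:

    import re

    WHITESPACE_RE = re.compile(r"\s+")
    NON_ALPHANUM_RE = re.compile(r"[^\w\s\-_]+")
    # Plausible stand-in for LONE_HYPHEN_RE: a hyphen that has no word
    # character directly on at least one side.
    LONE_HYPHEN_RE = re.compile(r"\s*-\s*(?!\w)|(?<!\w)\s*-\s*")

    def clean_keyword_sketch(kw: str) -> str:
        # Drop punctuation except whitespace, hyphens and underscores
        cleaned = NON_ALPHANUM_RE.sub("", kw).strip()
        # Replace lone hyphens ("- try") with spaces, keep hyphenated words ("user-name")
        cleaned = LONE_HYPHEN_RE.sub(" ", cleaned)
        # Collapse any runs of whitespace left behind
        return WHITESPACE_RE.sub(" ", cleaned).strip()

    # clean_keyword_sketch("- google")      -> "google"
    # clean_keyword_sketch("user-name_123") -> "user-name_123"
    # clean_keyword_sketch("$price: 100%")  -> "price 100"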
@@ -121,6 +133,7 @@ def text_to_tsvector_query(paragraph: str, top_n: int = 25, min_keywords: int = paragraph (str): The input text to process top_n (int): Number of top keywords to extract per sentence min_keywords (int): Minimum number of keywords required + split_chunks (bool): If True, breaks multi-word noun chunks into individual words Returns: str: Keywords/phrases joined by OR @@ -135,11 +148,11 @@ def text_to_tsvector_query(paragraph: str, top_n: int = 25, min_keywords: int = sent_doc = sent.as_doc() # Extract keywords - keywords = extract_keywords(sent_doc, top_n) + keywords = extract_keywords(sent_doc, top_n, split_chunks=split_chunks) if len(keywords) < min_keywords: continue - queries.add(" OR ".join(keywords)) + queries.update(keywords) # Join all terms with " OR " return " OR ".join(queries) if queries else "" diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index d4d685c1d..82147a398 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -369,63 +369,6 @@ async def _( assert result[0].metadata is not None -@test("utility: test for text_to_tsvector_query") -async def _(): - test_cases = [ - # Single words - ("test", "test"), - # Multiple words in single sentence - ( - "quick brown fox", - "quick brown fox", # Now kept as a single phrase due to proximity - ), - # Technical terms and phrases - ( - "Machine Learning algorithm", - "machine learning algorithm", # Common technical phrase - ), - # Multiple sentences - ( - "I love basketball especially Michael Jordan. LeBron James is also great.", - "basketball OR lebron james OR michael jordan", - ), - # Quoted phrases - ( - '"quick brown fox"', - "quick brown fox", # Quotes removed, phrase kept together - ), - ('Find "machine learning" algorithms', "machine learning"), - # Multiple quoted phrases - ('"data science" and "machine learning"', "machine learning OR data science"), - # Edge cases - ("", ""), - ( - "the and or", - "", # All stop words should result in empty string - ), - ( - "a", - "", # Single stop word should result in empty string - ), - ("X", "X"), - # Empty quotes - ('""', ""), - ('test "" phrase', "phrase OR test"), - ] - - for input_text, expected_output in test_cases: - print(f"Input: '{input_text}'") - result = text_to_tsvector_query(input_text) - print(f"Generated query: '{result}'") - print(f"Expected: '{expected_output}'\n") - - result_terms = {term.lower() for term in result.split(" OR ") if term} - expected_terms = {term.lower() for term in expected_output.split(" OR ") if term} - assert result_terms == expected_terms, ( - f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" - ) - - # @test("query: search docs by embedding with different confidence levels") # async def _( # dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding diff --git a/agents-api/tests/test_nlp_utilities.py b/agents-api/tests/test_nlp_utilities.py new file mode 100644 index 000000000..63d7f126c --- /dev/null +++ b/agents-api/tests/test_nlp_utilities.py @@ -0,0 +1,147 @@ +from agents_api.common.nlp import text_to_tsvector_query, clean_keyword, extract_keywords +import spacy + +from ward import test + +@test("utility: clean_keyword") +async def _(): + assert clean_keyword("Hello, World!") == "Hello World" + + # Basic cleaning + # assert clean_keyword("test@example.com") == "test example com" + assert clean_keyword("user-name_123") == "user-name_123" + assert clean_keyword(" spaces ") == "spaces" + 
+ # Special characters + assert clean_keyword("$price: 100%") == "price 100" + assert clean_keyword("#hashtag!") == "hashtag" + + # Multiple spaces and punctuation + assert clean_keyword("multiple, spaces...") == "multiple spaces" + + # Empty and whitespace + assert clean_keyword("") == "" + assert clean_keyword(" ") == "" + + assert clean_keyword("- try") == "try" + +@test("utility: extract_keywords") +async def _(): + nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "textcat"]) + doc = nlp("John Doe is a software engineer at Google.") + assert set(extract_keywords(doc)) == {"John Doe", "a software engineer", "Google"} + +@test("utility: text_to_tsvector_query - split_chunks=False") +async def _(): + test_cases = [ + # Single words + ("test", "test"), + # Multiple words in single sentence + ( + "quick brown fox", + "quick brown fox", # Now kept as a single phrase due to proximity + ), + # Technical terms and phrases + ( + "Machine Learning algorithm", + "machine learning algorithm", # Common technical phrase + ), + # Multiple sentences + ( + "I love basketball especially Michael Jordan. LeBron James is also great.", + "basketball OR lebron james OR michael jordan", + ), + # Quoted phrases + ( + '"quick brown fox"', + "quick brown fox", # Quotes removed, phrase kept together + ), + ('Find "machine learning" algorithms', "machine learning"), + # Multiple quoted phrases + ('"data science" and "machine learning"', "machine learning OR data science"), + # Edge cases + ("", ""), + ( + "the and or", + "", # All stop words should result in empty string + ), + ( + "a", + "", # Single stop word should result in empty string + ), + ("X", "X"), + # Empty quotes + ('""', ""), + ('test "" phrase', "phrase OR test"), + ("John Doe is a software engineer at Google.", "google OR john doe OR a software engineer"), + ("- google", "google"), + ] + + for input_text, expected_output in test_cases: + print(f"Input: '{input_text}'") + result = text_to_tsvector_query(input_text, split_chunks=False) + print(f"Generated query: '{result}'") + print(f"Expected: '{expected_output}'\n") + + result_terms = set(term.lower() for term in result.split(" OR ") if term) + expected_terms = set(term.lower() for term in expected_output.split(" OR ") if term) + assert result_terms == expected_terms, ( + f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" + ) + +@test("utility: text_to_tsvector_query - split_chunks=True") +async def _(): + test_cases = [ + # Single words + ("test", "test"), + # Multiple words in single sentence + ( + "quick brown fox", + "quick OR brown OR fox", # Now kept as a single phrase due to proximity + ), + # Technical terms and phrases + ( + "Machine Learning algorithm", + "machine OR learning OR algorithm", # Common technical phrase + ), + # Multiple sentences + ( + "I love basketball especially Michael Jordan. 
LeBron James is also great.", + "basketball OR lebron james OR michael jordan", + ), + # Quoted phrases + ( + '"quick brown fox"', + "quick OR brown OR fox", # Quotes removed, phrase kept together + ), + ('Find "machine learning" algorithms', "machine OR learning"), + # Multiple quoted phrases + ('"data science" and "machine learning"', "machine OR learning OR data OR science"), + # Edge cases + ("", ""), + ( + "the and or", + "", # All stop words should result in empty string + ), + ( + "a", + "", # Single stop word should result in empty string + ), + ("X", "X"), + # Empty quotes + ('""', ""), + ('test "" phrase', "phrase OR test"), + ("John Doe is a software engineer at Google.", "google OR john doe OR a OR software OR engineer"), + ] + + for input_text, expected_output in test_cases: + print(f"Input: '{input_text}'") + result = text_to_tsvector_query(input_text, split_chunks=True) + print(f"Generated query: '{result}'") + print(f"Expected: '{expected_output}'\n") + + result_terms = set(term.lower() for term in result.split(" OR ") if term) + expected_terms = set(term.lower() for term in expected_output.split(" OR ") if term) + assert result_terms == expected_terms, ( + f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" + ) \ No newline at end of file From 3fa200c74f85f35c66d8019b5b4daf7eca397f17 Mon Sep 17 00:00:00 2001 From: Ahmad-mtos Date: Wed, 15 Jan 2025 08:41:26 +0000 Subject: [PATCH 21/27] refactor: Lint agents-api (CI) --- agents-api/agents_api/common/nlp.py | 9 +++--- agents-api/tests/test_docs_queries.py | 1 - agents-api/tests/test_nlp_utilities.py | 39 ++++++++++++++++---------- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index c49928508..6c5f49f74 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -9,7 +9,7 @@ # Precompile regex patterns WHITESPACE_RE = re.compile(r"\s+") NON_ALPHANUM_RE = re.compile(r"[^\w\s\-_]+") -LONE_HYPHEN_RE = re.compile(r'\s*-\s*(?!\w)|(? str: # Replace lone hyphens with spaces cleaned = LONE_HYPHEN_RE.sub(" ", cleaned) # Clean up any resulting multiple spaces - cleaned = WHITESPACE_RE.sub(" ", cleaned).strip() - return cleaned + return WHITESPACE_RE.sub(" ", cleaned).strip() -def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True, split_chunks: bool = False) -> list[str]: +def extract_keywords( + doc: Doc, top_n: int = 25, clean: bool = True, split_chunks: bool = False +) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { "DATE", # Absolute or relative dates or periods. 
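The new split_chunks flag is what turns "Machine Learning algorithm" into "machine OR learning OR algorithm" in the tests above: after whitespace normalization, multi-word noun chunks are exploded into individual words before frequency counting, while named-entity keywords stay intact, which is why "john doe" survives splitting. A standalone sketch of just that normalization step (the helper name is illustrative):

    import re

    WHITESPACE_RE = re.compile(r"\s+")

    def normalize_keywords(keywords: list[str], split_chunks: bool) -> list[str]:
        # Collapse internal whitespace, as extract_keywords does
        normalized = [WHITESPACE_RE.sub(" ", kw).strip() for kw in keywords]
        if split_chunks:
            # Break multi-word chunks into individual words
            normalized = [word for kw in normalized for word in kw.split()]
        return normalized

    # normalize_keywords(["Machine Learning algorithm"], split_chunks=False)
    #   -> ["Machine Learning algorithm"]
    # normalize_keywords(["Machine Learning algorithm"], split_chunks=True)
    #   -> ["Machine", "Learning", "algorithm"]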
diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index 82147a398..7782b3bf7 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -1,6 +1,5 @@ from agents_api.autogen.openapi_model import CreateDocRequest from agents_api.clients.pg import create_db_pool -from agents_api.common.nlp import text_to_tsvector_query from agents_api.queries.docs.create_doc import create_doc from agents_api.queries.docs.delete_doc import delete_doc from agents_api.queries.docs.get_doc import get_doc diff --git a/agents-api/tests/test_nlp_utilities.py b/agents-api/tests/test_nlp_utilities.py index 63d7f126c..82c22911a 100644 --- a/agents-api/tests/test_nlp_utilities.py +++ b/agents-api/tests/test_nlp_utilities.py @@ -1,36 +1,38 @@ -from agents_api.common.nlp import text_to_tsvector_query, clean_keyword, extract_keywords import spacy - +from agents_api.common.nlp import clean_keyword, extract_keywords, text_to_tsvector_query from ward import test + @test("utility: clean_keyword") async def _(): assert clean_keyword("Hello, World!") == "Hello World" - + # Basic cleaning # assert clean_keyword("test@example.com") == "test example com" assert clean_keyword("user-name_123") == "user-name_123" assert clean_keyword(" spaces ") == "spaces" - + # Special characters assert clean_keyword("$price: 100%") == "price 100" assert clean_keyword("#hashtag!") == "hashtag" - + # Multiple spaces and punctuation assert clean_keyword("multiple, spaces...") == "multiple spaces" - + # Empty and whitespace assert clean_keyword("") == "" assert clean_keyword(" ") == "" assert clean_keyword("- try") == "try" + @test("utility: extract_keywords") async def _(): nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "textcat"]) doc = nlp("John Doe is a software engineer at Google.") assert set(extract_keywords(doc)) == {"John Doe", "a software engineer", "Google"} + @test("utility: text_to_tsvector_query - split_chunks=False") async def _(): test_cases = [ @@ -73,7 +75,10 @@ async def _(): # Empty quotes ('""', ""), ('test "" phrase', "phrase OR test"), - ("John Doe is a software engineer at Google.", "google OR john doe OR a software engineer"), + ( + "John Doe is a software engineer at Google.", + "google OR john doe OR a software engineer", + ), ("- google", "google"), ] @@ -82,13 +87,14 @@ async def _(): result = text_to_tsvector_query(input_text, split_chunks=False) print(f"Generated query: '{result}'") print(f"Expected: '{expected_output}'\n") - - result_terms = set(term.lower() for term in result.split(" OR ") if term) - expected_terms = set(term.lower() for term in expected_output.split(" OR ") if term) + + result_terms = {term.lower() for term in result.split(" OR ") if term} + expected_terms = {term.lower() for term in expected_output.split(" OR ") if term} assert result_terms == expected_terms, ( f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" ) + @test("utility: text_to_tsvector_query - split_chunks=True") async def _(): test_cases = [ @@ -131,7 +137,10 @@ async def _(): # Empty quotes ('""', ""), ('test "" phrase', "phrase OR test"), - ("John Doe is a software engineer at Google.", "google OR john doe OR a OR software OR engineer"), + ( + "John Doe is a software engineer at Google.", + "google OR john doe OR a OR software OR engineer", + ), ] for input_text, expected_output in test_cases: @@ -139,9 +148,9 @@ async def _(): result = text_to_tsvector_query(input_text, split_chunks=True) print(f"Generated query: 
'{result}'") print(f"Expected: '{expected_output}'\n") - - result_terms = set(term.lower() for term in result.split(" OR ") if term) - expected_terms = set(term.lower() for term in expected_output.split(" OR ") if term) + + result_terms = {term.lower() for term in result.split(" OR ") if term} + expected_terms = {term.lower() for term in expected_output.split(" OR ") if term} assert result_terms == expected_terms, ( f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" - ) \ No newline at end of file + ) From 2c25490dbc04f34e24d062e1fdb5bf0fc55cfd1e Mon Sep 17 00:00:00 2001 From: Ahmad Haidar Date: Wed, 15 Jan 2025 11:45:15 +0300 Subject: [PATCH 22/27] chore(agents-api): utilize ``text_to_tsvector_query`` in search queries --- agents-api/agents_api/queries/docs/search_docs_by_text.py | 3 ++- agents-api/agents_api/queries/docs/search_docs_hybrid.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/agents-api/agents_api/queries/docs/search_docs_by_text.py b/agents-api/agents_api/queries/docs/search_docs_by_text.py index b1758625b..0a48123d2 100644 --- a/agents-api/agents_api/queries/docs/search_docs_by_text.py +++ b/agents-api/agents_api/queries/docs/search_docs_by_text.py @@ -8,6 +8,7 @@ from ...common.utils.db_exceptions import common_db_exceptions from ..utils import pg_query, rewrap_exceptions, wrap_in_class from .utils import transform_to_doc_reference +from ...common.nlp import text_to_tsvector_query # Raw query for text search search_docs_text_query = """ @@ -61,7 +62,7 @@ async def search_docs_by_text( owner_types: list[str] = [owner[0] for owner in owners] owner_ids: list[str] = [str(owner[1]) for owner in owners] # Pre-process rawtext query - # query = text_to_tsvector_query(query) + query = text_to_tsvector_query(query) return ( search_docs_text_query, diff --git a/agents-api/agents_api/queries/docs/search_docs_hybrid.py b/agents-api/agents_api/queries/docs/search_docs_hybrid.py index fe68bc075..d33347db7 100644 --- a/agents-api/agents_api/queries/docs/search_docs_hybrid.py +++ b/agents-api/agents_api/queries/docs/search_docs_hybrid.py @@ -6,6 +6,7 @@ from ...autogen.openapi_model import DocReference from ...common.utils.db_exceptions import common_db_exceptions +from ...common.nlp import text_to_tsvector_query from ..utils import ( pg_query, rewrap_exceptions, @@ -81,6 +82,9 @@ async def search_docs_hybrid( owner_types: list[str] = [owner[0] for owner in owners] owner_ids: list[str] = [str(owner[1]) for owner in owners] + # Pre-process rawtext query + text_query = text_to_tsvector_query(text_query) + return ( search_docs_hybrid_query, [ From 363c7c63cdcd2357bd4f613c4c3775002b9af355 Mon Sep 17 00:00:00 2001 From: Ahmad-mtos Date: Wed, 15 Jan 2025 08:46:07 +0000 Subject: [PATCH 23/27] refactor: Lint agents-api (CI) --- agents-api/agents_api/queries/docs/search_docs_by_text.py | 2 +- agents-api/agents_api/queries/docs/search_docs_hybrid.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agents-api/agents_api/queries/docs/search_docs_by_text.py b/agents-api/agents_api/queries/docs/search_docs_by_text.py index 0a48123d2..b98906466 100644 --- a/agents-api/agents_api/queries/docs/search_docs_by_text.py +++ b/agents-api/agents_api/queries/docs/search_docs_by_text.py @@ -5,10 +5,10 @@ from fastapi import HTTPException from ...autogen.openapi_model import DocReference +from ...common.nlp import text_to_tsvector_query from ...common.utils.db_exceptions import common_db_exceptions from ..utils import pg_query, 
rewrap_exceptions, wrap_in_class from .utils import transform_to_doc_reference -from ...common.nlp import text_to_tsvector_query # Raw query for text search search_docs_text_query = """ diff --git a/agents-api/agents_api/queries/docs/search_docs_hybrid.py b/agents-api/agents_api/queries/docs/search_docs_hybrid.py index d33347db7..1f7c363c4 100644 --- a/agents-api/agents_api/queries/docs/search_docs_hybrid.py +++ b/agents-api/agents_api/queries/docs/search_docs_hybrid.py @@ -5,8 +5,8 @@ from fastapi import HTTPException from ...autogen.openapi_model import DocReference -from ...common.utils.db_exceptions import common_db_exceptions from ...common.nlp import text_to_tsvector_query +from ...common.utils.db_exceptions import common_db_exceptions from ..utils import ( pg_query, rewrap_exceptions, From 9eb018f49469fc926da92a0d63b9557282a4f7f8 Mon Sep 17 00:00:00 2001 From: Ahmad Haidar Date: Wed, 15 Jan 2025 12:03:56 +0300 Subject: [PATCH 24/27] chore(agents-api): remove clean parameter from ``extract_keywords`` --- agents-api/agents_api/common/nlp.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 6c5f49f74..aea2380d8 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -42,7 +42,7 @@ def clean_keyword(kw: str) -> str: def extract_keywords( - doc: Doc, top_n: int = 25, clean: bool = True, split_chunks: bool = False + doc: Doc, top_n: int = 25, split_chunks: bool = False ) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { @@ -114,10 +114,7 @@ def extract_keywords( remaining_slots = max(0, top_n - len(top_keywords)) top_keywords += [kw for kw, _ in freq.most_common(remaining_slots)] - if clean: - return [clean_keyword(kw) for kw in top_keywords] - return top_keywords - + return [clean_keyword(kw) for kw in top_keywords] @lru_cache(maxsize=1000) def text_to_tsvector_query( From 9df8de412c4cb0cc2a23c8e123f92acafacb0d5b Mon Sep 17 00:00:00 2001 From: Ahmad-mtos Date: Wed, 15 Jan 2025 09:04:47 +0000 Subject: [PATCH 25/27] refactor: Lint agents-api (CI) --- agents-api/agents_api/common/nlp.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index aea2380d8..01f6ee7e1 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -41,9 +41,7 @@ def clean_keyword(kw: str) -> str: return WHITESPACE_RE.sub(" ", cleaned).strip() -def extract_keywords( - doc: Doc, top_n: int = 25, split_chunks: bool = False -) -> list[str]: +def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = False) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { "DATE", # Absolute or relative dates or periods. 
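One detail worth keeping in mind around these clean-ups: text_to_tsvector_query stays wrapped in @lru_cache(maxsize=1000), which is why it keeps taking a single hashable paragraph string plus scalar options, and why repeated identical search queries skip the spaCy pass entirely. An illustrative stand-in (the real function runs the NLP pipeline where the comment indicates):

    from functools import lru_cache

    @lru_cache(maxsize=1000)
    def build_query(paragraph: str, top_n: int = 25) -> str:
        # Stand-in for text_to_tsvector_query; the real implementation runs spaCy here
        return " OR ".join(sorted(set(paragraph.lower().split()))[:top_n])

    build_query("john doe at google")
    build_query("john doe at google")  # identical call, served from the cache
    print(build_query.cache_info())    # CacheInfo(hits=1, misses=1, maxsize=1000, currsize=1)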
@@ -116,6 +114,7 @@ def extract_keywords( return [clean_keyword(kw) for kw in top_keywords] + @lru_cache(maxsize=1000) def text_to_tsvector_query( paragraph: str, top_n: int = 25, min_keywords: int = 1, split_chunks: bool = False From 8fe87cbab6fca684f0b17cafd96e4207ea13061a Mon Sep 17 00:00:00 2001 From: Ahmad Haidar Date: Wed, 15 Jan 2025 12:27:46 +0300 Subject: [PATCH 26/27] fix(agents-api): increase test coverage + set ``split_cuncks=Ture`` as default --- agents-api/agents_api/common/nlp.py | 4 ++-- .../queries/docs/search_docs_by_text.py | 2 +- .../queries/docs/search_docs_hybrid.py | 2 +- agents-api/tests/test_nlp_utilities.py | 18 ++++++++++++++++++ 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 01f6ee7e1..c89339ae2 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -41,7 +41,7 @@ def clean_keyword(kw: str) -> str: return WHITESPACE_RE.sub(" ", cleaned).strip() -def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = False) -> list[str]: +def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = True) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { "DATE", # Absolute or relative dates or periods. @@ -117,7 +117,7 @@ def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = False) -> l @lru_cache(maxsize=1000) def text_to_tsvector_query( - paragraph: str, top_n: int = 25, min_keywords: int = 1, split_chunks: bool = False + paragraph: str, top_n: int = 25, min_keywords: int = 1, split_chunks: bool = True ) -> str: """ Extracts meaningful keywords/phrases from text and joins them with OR. diff --git a/agents-api/agents_api/queries/docs/search_docs_by_text.py b/agents-api/agents_api/queries/docs/search_docs_by_text.py index b98906466..6632d3162 100644 --- a/agents-api/agents_api/queries/docs/search_docs_by_text.py +++ b/agents-api/agents_api/queries/docs/search_docs_by_text.py @@ -62,7 +62,7 @@ async def search_docs_by_text( owner_types: list[str] = [owner[0] for owner in owners] owner_ids: list[str] = [str(owner[1]) for owner in owners] # Pre-process rawtext query - query = text_to_tsvector_query(query) + query = text_to_tsvector_query(query, split_chunks=True) return ( search_docs_text_query, diff --git a/agents-api/agents_api/queries/docs/search_docs_hybrid.py b/agents-api/agents_api/queries/docs/search_docs_hybrid.py index 1f7c363c4..6047069f8 100644 --- a/agents-api/agents_api/queries/docs/search_docs_hybrid.py +++ b/agents-api/agents_api/queries/docs/search_docs_hybrid.py @@ -83,7 +83,7 @@ async def search_docs_hybrid( owner_ids: list[str] = [str(owner[1]) for owner in owners] # Pre-process rawtext query - text_query = text_to_tsvector_query(text_query) + text_query = text_to_tsvector_query(text_query, split_chunks=True) return ( search_docs_hybrid_query, diff --git a/agents-api/tests/test_nlp_utilities.py b/agents-api/tests/test_nlp_utilities.py index 82c22911a..5677de7d9 100644 --- a/agents-api/tests/test_nlp_utilities.py +++ b/agents-api/tests/test_nlp_utilities.py @@ -80,6 +80,15 @@ async def _(): "google OR john doe OR a software engineer", ), ("- google", "google"), + # Test duplicate keyword handling + ( + "John Doe is great. John Doe is awesome.", + "john doe", # Should only include "John Doe" once + ), + ( + "Software Engineer at Google. 
Also, a Software Engineer.", + "Google OR Also a Software Engineer OR Software Engineer", # Should only include "Software Engineer" once + ), ] for input_text, expected_output in test_cases: @@ -141,6 +150,15 @@ async def _(): "John Doe is a software engineer at Google.", "google OR john doe OR a OR software OR engineer", ), + # Test duplicate keyword handling + ( + "John Doe is great. John Doe is awesome.", + "john doe", # Should only include "John Doe" once even with split_chunks=True + ), + ( + "Software Engineer at Google. Also, a Software Engineer.", + "Also OR a OR google OR software OR engineer", # When split, each word appears once + ), ] for input_text, expected_output in test_cases: From fcd2ad30db40e69e55aedfc585479ba00ef79770 Mon Sep 17 00:00:00 2001 From: Ahmad Haidar Date: Wed, 15 Jan 2025 12:31:22 +0300 Subject: [PATCH 27/27] tests hotfix --- agents-api/tests/test_nlp_utilities.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/agents-api/tests/test_nlp_utilities.py b/agents-api/tests/test_nlp_utilities.py index 5677de7d9..733f695d5 100644 --- a/agents-api/tests/test_nlp_utilities.py +++ b/agents-api/tests/test_nlp_utilities.py @@ -26,11 +26,28 @@ async def _(): assert clean_keyword("- try") == "try" -@test("utility: extract_keywords") +@test("utility: extract_keywords - split_chunks=False") async def _(): nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "textcat"]) doc = nlp("John Doe is a software engineer at Google.") - assert set(extract_keywords(doc)) == {"John Doe", "a software engineer", "Google"} + assert set(extract_keywords(doc, split_chunks=False)) == { + "John Doe", + "a software engineer", + "Google", + } + + +@test("utility: extract_keywords - split_chunks=True") +async def _(): + nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "textcat"]) + doc = nlp("John Doe is a software engineer at Google.") + assert set(extract_keywords(doc, split_chunks=True)) == { + "John Doe", + "a", + "software", + "engineer", + "Google", + } @test("utility: text_to_tsvector_query - split_chunks=False")
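The order-insensitive assertion used throughout these tests (splitting on " OR ", lowercasing, comparing sets) appears verbatim in each test body. If more variants get added, it could be pulled into a small helper along these lines (a sketch, not part of the patch series):

    def or_terms(query: str) -> set[str]:
        # Split an "a OR b OR c" query into a case-insensitive set of terms;
        # empty fragments are dropped so "" maps to an empty set.
        return {term.lower() for term in query.split(" OR ") if term}

    def assert_same_terms(result: str, expected: str, input_text: str) -> None:
        assert or_terms(result) == or_terms(expected), (
            f"Expected terms {or_terms(expected)} but got {or_terms(result)} "
            f"for input '{input_text}'"
        )

    # assert_same_terms("google OR john doe", "john doe OR google", "John Doe at Google")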