From e466dfea4ebca6addf487ffa60747cb4649f5e05 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Sun, 12 Jan 2025 15:25:48 -0500 Subject: [PATCH 01/27] fix(agetns-api): init nlp pipeline text-search --- agents-api/agents_api/common/nlp.py | 100 ++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 00ba3d881..502726259 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -294,3 +294,103 @@ def batch_paragraphs_to_custom_queries( results.append(queries) return results + +@lru_cache(maxsize=1000) +def text_to_tsvector_query(text: str, top_n: int = 10) -> str: + """ + Converts text into a PostgreSQL tsquery format using sophisticated NLP processing. + Cached for repeated queries. + + Args: + text (str): Input text to convert + top_n (int): Number of top keywords to include + + Returns: + str: PostgreSQL tsquery compatible string + """ + if not text or not text.strip(): + return "" + + # Process text with spaCy + doc = nlp(text) + + # Extract important keywords using existing extract_keywords function + keywords = extract_keywords(doc, top_n=top_n, clean=True) + + if not keywords: + return "" + + # Find keyword positions using existing matcher + keyword_positions = keyword_matcher.find_matches(doc, keywords) + + if not keyword_positions: + return "" + + # Find proximity groups + groups = find_proximity_groups(keywords, keyword_positions, n=10) + + # Convert groups to tsquery format + tsquery_parts = [] + + for group in groups: + if len(group) == 1: + # Single keyword + tsquery_parts.append(next(iter(group))) + else: + # For multiple keywords in proximity, use <-> operator in PostgreSQL + sorted_group = sorted(group, key=len, reverse=True) + tsquery_parts.append( + "(" + " <-> ".join(f"'{word}'" for word in sorted_group) + ")" + ) + + return " | ".join(tsquery_parts) + +def batch_text_to_tsvector_queries( + paragraphs: list[str], # Changed to list since we don't need tuple for caching + top_n: int = 10, +) -> list[str]: + """ + Process multiple paragraphs into tsquery format efficiently. 
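As a point of reference for the tsquery strings built above (keywords joined with `<->` inside a proximity group and `|` between groups), here is a minimal sketch of how such a string could be handed to PostgreSQL full-text search. The table name `docs` and column `search_tsv` are assumptions for illustration only; they are not taken from this patch.

import asyncio
import asyncpg

async def search_with_tsquery(dsn: str, tsquery_text: str) -> list:
    # tsquery_text is assumed to be the string returned by text_to_tsvector_query,
    # e.g. "('machine learning' <-> 'algorithm') | 'database'".
    conn = await asyncpg.connect(dsn)
    try:
        return await conn.fetch(
            """
            SELECT doc_id, ts_rank_cd(search_tsv, q) AS rank   -- column name is assumed
            FROM docs, to_tsquery('english', $1) AS q          -- table name is assumed
            WHERE search_tsv @@ q
            ORDER BY rank DESC
            LIMIT 10
            """,
            tsquery_text,
        )
    finally:
        await conn.close()

# usage sketch: asyncio.run(search_with_tsquery(dsn, text_to_tsvector_query("some text")))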
+ + Args: + paragraphs (list[str]): List of paragraphs to process + top_n (int): Number of top keywords to include per paragraph + + Returns: + list[str]: List of tsquery strings + """ + results = [] + + # Use spaCy's pipe for efficient batch processing + docs = nlp.pipe(paragraphs) + + for doc in docs: + # Process each paragraph + keywords = extract_keywords(doc, top_n=top_n, clean=True) + + if not keywords: + results.append("") + continue + + keyword_positions = keyword_matcher.find_matches(doc, keywords) + + if not keyword_positions: + results.append("") + continue + + groups = find_proximity_groups(keywords, keyword_positions, n=10) + + # Build tsquery for this paragraph + tsquery_parts = [] + for group in groups: + if len(group) == 1: + tsquery_parts.append(next(iter(group))) + else: + sorted_group = sorted(group, key=len, reverse=True) + tsquery_parts.append( + "(" + " <-> ".join(f"'{word}'" for word in sorted_group) + ")" + ) + + results.append(" | ".join(tsquery_parts)) + + return results From ba39b547c87b991899900caff528b5ec512cb4d4 Mon Sep 17 00:00:00 2001 From: Vedantsahai18 Date: Sun, 12 Jan 2025 20:26:47 +0000 Subject: [PATCH 02/27] refactor: Lint agents-api (CI) --- agents-api/agents_api/common/nlp.py | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 502726259..250eec6c5 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -295,43 +295,44 @@ def batch_paragraphs_to_custom_queries( return results + @lru_cache(maxsize=1000) def text_to_tsvector_query(text: str, top_n: int = 10) -> str: """ Converts text into a PostgreSQL tsquery format using sophisticated NLP processing. Cached for repeated queries. - + Args: text (str): Input text to convert top_n (int): Number of top keywords to include - + Returns: str: PostgreSQL tsquery compatible string """ if not text or not text.strip(): return "" - + # Process text with spaCy doc = nlp(text) - + # Extract important keywords using existing extract_keywords function keywords = extract_keywords(doc, top_n=top_n, clean=True) - + if not keywords: return "" - + # Find keyword positions using existing matcher keyword_positions = keyword_matcher.find_matches(doc, keywords) - + if not keyword_positions: return "" - + # Find proximity groups groups = find_proximity_groups(keywords, keyword_positions, n=10) - + # Convert groups to tsquery format tsquery_parts = [] - + for group in groups: if len(group) == 1: # Single keyword @@ -339,23 +340,22 @@ def text_to_tsvector_query(text: str, top_n: int = 10) -> str: else: # For multiple keywords in proximity, use <-> operator in PostgreSQL sorted_group = sorted(group, key=len, reverse=True) - tsquery_parts.append( - "(" + " <-> ".join(f"'{word}'" for word in sorted_group) + ")" - ) - + tsquery_parts.append("(" + " <-> ".join(f"'{word}'" for word in sorted_group) + ")") + return " | ".join(tsquery_parts) + def batch_text_to_tsvector_queries( paragraphs: list[str], # Changed to list since we don't need tuple for caching top_n: int = 10, ) -> list[str]: """ Process multiple paragraphs into tsquery format efficiently. 
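A side note on the "Changed to list since we don't need tuple for caching" comment above: functools.lru_cache only accepts hashable arguments, which is why the single-text function is cached on its str argument while the batch variant takes a plain list and stays uncached. A minimal sketch of that trade-off; the names here are illustrative, not from the patch.

from functools import lru_cache

@lru_cache(maxsize=1000)
def cached_query(text: str) -> str:
    # str is hashable, so repeated inputs hit the cache
    return text.strip().lower()

def batch_queries(texts: list[str]) -> list[str]:
    # a list argument is unhashable and cannot be memoized directly;
    # callers would have to pass tuple(texts) to a cached wrapper instead
    return [cached_query(t) for t in texts]

assert batch_queries(["Hello ", "WORLD"]) == ["hello", "world"]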
- + Args: paragraphs (list[str]): List of paragraphs to process top_n (int): Number of top keywords to include per paragraph - + Returns: list[str]: List of tsquery strings """ From 8c3d6be2139bb0b958f3a5ac814c08c942901b6d Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Sun, 12 Jan 2025 23:21:21 -0500 Subject: [PATCH 03/27] chore: misc update --- agents-api/agents_api/common/nlp.py | 284 +++++++++++++++------------- 1 file changed, 155 insertions(+), 129 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 250eec6c5..8a640a535 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -180,35 +180,162 @@ def union(u: str, v: str) -> None: return list(groups.values()) -def build_query_pattern(group_size: int, n: int) -> str: - """Cache query patterns for common group sizes.""" - if group_size == 1: - return '"{}"' - return f"NEAR/{n}(" + " ".join('"{}"' for _ in range(group_size)) + ")" +# def build_query_pattern(group_size: int, n: int) -> str: +# """Cache query patterns for common group sizes.""" +# if group_size == 1: +# return '"{}"' +# return f"NEAR/{n}(" + " ".join('"{}"' for _ in range(group_size)) + ")" + + +# def build_query(groups: list[set[str]], n: int = 10) -> str: +# """Build query with cached patterns.""" +# clauses = [] + +# for group in groups: +# if len(group) == 1: +# clauses.append(f'"{next(iter(group))}"') +# else: +# # Sort by length descending to prioritize longer phrases +# sorted_group = sorted(group, key=len, reverse=True) +# # Get cached pattern and format with keywords +# pattern = build_query_pattern(len(group), n) +# clause = pattern.format(*sorted_group) +# clauses.append(clause) + +# return " OR ".join(clauses) + + +# @lru_cache(maxsize=100) +# def paragraph_to_custom_queries( +# paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1 +# ) -> list[str]: +# """ +# Optimized paragraph processing with minimal behavior changes. +# Added min_keywords parameter to filter out low-value queries. + +# Args: +# paragraph (str): The input paragraph to convert. +# top_n (int): Number of top keywords to extract per sentence. +# proximity_n (int): The proximity window for NEAR/n. +# min_keywords (int): Minimum number of keywords required to form a query. + +# Returns: +# list[str]: The list of custom query strings. +# """ +# if not paragraph or not paragraph.strip(): +# return [] + +# # Process entire paragraph once +# doc = nlp(paragraph) +# queries = [] + +# # Process sentences +# for sent in doc.sents: +# # Convert to doc for consistent API +# sent_doc = sent.as_doc() + +# # Extract and clean keywords +# keywords = extract_keywords(sent_doc, top_n) +# if len(keywords) < min_keywords: +# continue + +# # Find keyword positions using matcher +# keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) + +# # Skip if no keywords found in positions +# if not keyword_positions: +# continue + +# # Find proximity groups and build query +# groups = find_proximity_groups(keywords, keyword_positions, proximity_n) +# query = build_query(groups, proximity_n) + +# if query: +# queries.append(query) + +# return queries + + +# def batch_paragraphs_to_custom_queries( +# paragraphs: list[str], +# top_n: int = 10, +# proximity_n: int = 10, +# min_keywords: int = 1, +# n_process: int = 1, +# ) -> list[list[str]]: +# """ +# Processes multiple paragraphs using nlp.pipe for better performance. + +# Args: +# paragraphs (list[str]): list of paragraphs to process. 
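The builders being retired here emitted NEAR/n syntax, which PostgreSQL's tsquery parser does not accept. If proximity semantics were ever wanted again, tsquery's distance operator `<N>` (of which `<->` is the N=1 case) is the rough ordered, exact-distance counterpart; the helper below only illustrates that operator and is not part of this patch.

def near_to_tsquery(words: list[str], n: int) -> str:
    # Approximate the old NEAR/n grouping with PostgreSQL's <N> distance operator.
    if len(words) == 1:
        return f"'{words[0]}'"
    return "(" + f" <{n}> ".join(f"'{w}'" for w in words) + ")"

assert near_to_tsquery(["quick", "fox"], 3) == "('quick' <3> 'fox')"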
+# top_n (int): Number of top keywords to extract per sentence. +# proximity_n (int): The proximity window for NEAR/n. +# min_keywords (int): Minimum number of keywords required to form a query. +# n_process (int): Number of processes to use for multiprocessing. + +# Returns: +# list[list[str]]: A list where each element is a list of queries for a paragraph. +# """ +# results = [] +# for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): +# queries = [] +# for sent in doc.sents: +# sent_doc = sent.as_doc() +# keywords = extract_keywords(sent_doc, top_n) +# if len(keywords) < min_keywords: +# continue +# keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) +# if not keyword_positions: +# continue +# groups = find_proximity_groups(keywords, keyword_positions, proximity_n) +# query = build_query(groups, proximity_n) +# if query: +# queries.append(query) +# results.append(queries) + +# return results + + +def build_ts_query(groups: list[set[str]], proximity_n: int = 10) -> str: + """ + Builds a PostgreSQL tsquery string from groups of keywords. + Args: + groups (list[set[str]]): List of keyword groups + proximity_n (int): Maximum distance between words for proximity search -def build_query(groups: list[set[str]], n: int = 10) -> str: - """Build query with cached patterns.""" - clauses = [] + Returns: + str: PostgreSQL tsquery compatible string + """ + if not groups: + return "" + + query_parts = [] for group in groups: + if not group: # Skip empty groups + continue + if len(group) == 1: - clauses.append(f'"{next(iter(group))}"') + # Single word - just wrap in quotes + word = next(iter(group)) + # No need to check for stopwords since they should be filtered earlier + query_parts.append(f"'{word.lower()}'") else: - # Sort by length descending to prioritize longer phrases - sorted_group = sorted(group, key=len, reverse=True) - # Get cached pattern and format with keywords - pattern = build_query_pattern(len(group), n) - clause = pattern.format(*sorted_group) - clauses.append(clause) + # Multiple words - sort by length (descending) and connect with <-> + sorted_words = sorted(group, key=len, reverse=True) + filtered_words = [word.lower() for word in sorted_words] + if filtered_words: + phrase = " <-> ".join(f"'{word}'" for word in filtered_words) + query_parts.append(f"({phrase})") - return " OR ".join(clauses) + return " & ".join(query_parts) if query_parts else "" -@lru_cache(maxsize=100) -def paragraph_to_custom_queries( +@lru_cache(maxsize=1000) +def text_to_tsvector_query( paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1 -) -> list[str]: +) -> str: """ Optimized paragraph processing with minimal behavior changes. Added min_keywords parameter to filter out low-value queries. @@ -219,8 +346,9 @@ def paragraph_to_custom_queries( proximity_n (int): The proximity window for NEAR/n. min_keywords (int): Minimum number of keywords required to form a query. + Returns: - list[str]: The list of custom query strings. 
+ str: PostgreSQL tsquery compatible string """ if not paragraph or not paragraph.strip(): return [] @@ -248,7 +376,7 @@ def paragraph_to_custom_queries( # Find proximity groups and build query groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - query = build_query(groups, proximity_n) + query = build_ts_query(groups, proximity_n) if query: queries.append(query) @@ -256,7 +384,7 @@ def paragraph_to_custom_queries( return queries -def batch_paragraphs_to_custom_queries( +def batch_text_to_tsvector_queries( paragraphs: list[str], top_n: int = 10, proximity_n: int = 10, @@ -267,16 +395,14 @@ def batch_paragraphs_to_custom_queries( Processes multiple paragraphs using nlp.pipe for better performance. Args: - paragraphs (list[str]): list of paragraphs to process. - top_n (int): Number of top keywords to extract per sentence. - proximity_n (int): The proximity window for NEAR/n. - min_keywords (int): Minimum number of keywords required to form a query. - n_process (int): Number of processes to use for multiprocessing. + paragraphs (list[str]): List of paragraphs to process + top_n (int): Number of top keywords to include per paragraph Returns: - list[list[str]]: A list where each element is a list of queries for a paragraph. + list[str]: List of tsquery strings """ results = [] + for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): queries = [] for sent in doc.sents: @@ -288,109 +414,9 @@ def batch_paragraphs_to_custom_queries( if not keyword_positions: continue groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - query = build_query(groups, proximity_n) + query = build_ts_query(groups, proximity_n) if query: queries.append(query) results.append(queries) return results - - -@lru_cache(maxsize=1000) -def text_to_tsvector_query(text: str, top_n: int = 10) -> str: - """ - Converts text into a PostgreSQL tsquery format using sophisticated NLP processing. - Cached for repeated queries. - - Args: - text (str): Input text to convert - top_n (int): Number of top keywords to include - - Returns: - str: PostgreSQL tsquery compatible string - """ - if not text or not text.strip(): - return "" - - # Process text with spaCy - doc = nlp(text) - - # Extract important keywords using existing extract_keywords function - keywords = extract_keywords(doc, top_n=top_n, clean=True) - - if not keywords: - return "" - - # Find keyword positions using existing matcher - keyword_positions = keyword_matcher.find_matches(doc, keywords) - - if not keyword_positions: - return "" - - # Find proximity groups - groups = find_proximity_groups(keywords, keyword_positions, n=10) - - # Convert groups to tsquery format - tsquery_parts = [] - - for group in groups: - if len(group) == 1: - # Single keyword - tsquery_parts.append(next(iter(group))) - else: - # For multiple keywords in proximity, use <-> operator in PostgreSQL - sorted_group = sorted(group, key=len, reverse=True) - tsquery_parts.append("(" + " <-> ".join(f"'{word}'" for word in sorted_group) + ")") - - return " | ".join(tsquery_parts) - - -def batch_text_to_tsvector_queries( - paragraphs: list[str], # Changed to list since we don't need tuple for caching - top_n: int = 10, -) -> list[str]: - """ - Process multiple paragraphs into tsquery format efficiently. 
- - Args: - paragraphs (list[str]): List of paragraphs to process - top_n (int): Number of top keywords to include per paragraph - - Returns: - list[str]: List of tsquery strings - """ - results = [] - - # Use spaCy's pipe for efficient batch processing - docs = nlp.pipe(paragraphs) - - for doc in docs: - # Process each paragraph - keywords = extract_keywords(doc, top_n=top_n, clean=True) - - if not keywords: - results.append("") - continue - - keyword_positions = keyword_matcher.find_matches(doc, keywords) - - if not keyword_positions: - results.append("") - continue - - groups = find_proximity_groups(keywords, keyword_positions, n=10) - - # Build tsquery for this paragraph - tsquery_parts = [] - for group in groups: - if len(group) == 1: - tsquery_parts.append(next(iter(group))) - else: - sorted_group = sorted(group, key=len, reverse=True) - tsquery_parts.append( - "(" + " <-> ".join(f"'{word}'" for word in sorted_group) + ")" - ) - - results.append(" | ".join(tsquery_parts)) - - return results From 1d677a2589f079543c93ce4c46576b381917b9c4 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Sun, 12 Jan 2025 23:21:45 -0500 Subject: [PATCH 04/27] feat(test): add new embeddings + FTS tests --- agents-api/tests/fixtures.py | 11 + agents-api/tests/test_docs_queries.py | 800 +++++++++++++++++--------- 2 files changed, 553 insertions(+), 258 deletions(-) diff --git a/agents-api/tests/fixtures.py b/agents-api/tests/fixtures.py index b14078d68..166bbef73 100644 --- a/agents-api/tests/fixtures.py +++ b/agents-api/tests/fixtures.py @@ -175,6 +175,17 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test f"[{', '.join([str(x) for x in [1.0] * 1024])}]", ) + await pool.execute( + """ + INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) + VALUES ($1, $2, 0, 1, $3, $4) + """, # Changed chunk_seq from 0 to 1 + developer.id, + doc.id, + "Different test content", + f"[{', '.join([str(x) for x in [0.5] * 1024])}]", + ) + yield await get_doc(developer_id=developer.id, doc_id=doc.id, connection_pool=pool) diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index 6690badfd..70f13a129 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -1,260 +1,251 @@ -from agents_api.autogen.openapi_model import CreateDocRequest from agents_api.clients.pg import create_db_pool -from agents_api.queries.docs.create_doc import create_doc -from agents_api.queries.docs.delete_doc import delete_doc -from agents_api.queries.docs.get_doc import get_doc -from agents_api.queries.docs.list_docs import list_docs from agents_api.queries.docs.search_docs_by_embedding import search_docs_by_embedding -from agents_api.queries.docs.search_docs_by_text import search_docs_by_text -from agents_api.queries.docs.search_docs_hybrid import search_docs_hybrid from ward import test from .fixtures import ( pg_dsn, test_agent, test_developer, - test_doc, test_doc_with_embedding, - test_user, ) EMBEDDING_SIZE: int = 1024 -@test("query: create user doc") -async def _(dsn=pg_dsn, developer=test_developer, user=test_user): - pool = await create_db_pool(dsn=dsn) - doc_created = await create_doc( - developer_id=developer.id, - data=CreateDocRequest( - title="User Doc", - content=["Docs for user testing", "Docs for user testing 2"], - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - owner_type="user", - owner_id=user.id, - connection_pool=pool, - ) - - assert doc_created.id is not 
None - - # Verify doc appears in user's docs - found = await get_doc( - developer_id=developer.id, - doc_id=doc_created.id, - connection_pool=pool, - ) - assert found.id == doc_created.id - - -@test("query: create agent doc") -async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): - pool = await create_db_pool(dsn=dsn) - doc = await create_doc( - developer_id=developer.id, - data=CreateDocRequest( - title="Agent Doc", - content="Docs for agent testing", - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - assert doc.id is not None - - # Verify doc appears in agent's docs - docs_list = await list_docs( - developer_id=developer.id, - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - assert any(d.id == doc.id for d in docs_list) - - -@test("query: get doc") -async def _(dsn=pg_dsn, developer=test_developer, doc=test_doc): - pool = await create_db_pool(dsn=dsn) - doc_test = await get_doc( - developer_id=developer.id, - doc_id=doc.id, - connection_pool=pool, - ) - assert doc_test.id == doc.id - assert doc_test.title is not None - assert doc_test.content is not None - - -@test("query: list user docs") -async def _(dsn=pg_dsn, developer=test_developer, user=test_user): - pool = await create_db_pool(dsn=dsn) - - # Create a doc owned by the user - doc_user = await create_doc( - developer_id=developer.id, - data=CreateDocRequest( - title="User List Test", - content="Some user doc content", - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - owner_type="user", - owner_id=user.id, - connection_pool=pool, - ) - - # List user's docs - docs_list = await list_docs( - developer_id=developer.id, - owner_type="user", - owner_id=user.id, - connection_pool=pool, - ) - assert len(docs_list) >= 1 - assert any(d.id == doc_user.id for d in docs_list) - - -@test("query: list agent docs") -async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): - pool = await create_db_pool(dsn=dsn) - - # Create a doc owned by the agent - doc_agent = await create_doc( - developer_id=developer.id, - data=CreateDocRequest( - title="Agent List Test", - content="Some agent doc content", - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - - # List agent's docs - docs_list = await list_docs( - developer_id=developer.id, - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - assert len(docs_list) >= 1 - assert any(d.id == doc_agent.id for d in docs_list) - - -@test("query: delete user doc") -async def _(dsn=pg_dsn, developer=test_developer, user=test_user): - pool = await create_db_pool(dsn=dsn) - - # Create a doc owned by the user - doc_user = await create_doc( - developer_id=developer.id, - data=CreateDocRequest( - title="User Delete Test", - content="Doc for user deletion test", - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - owner_type="user", - owner_id=user.id, - connection_pool=pool, - ) - - # Delete the doc - await delete_doc( - developer_id=developer.id, - doc_id=doc_user.id, - owner_type="user", - owner_id=user.id, - connection_pool=pool, - ) - - # Verify doc is no longer in user's docs - docs_list = await list_docs( - developer_id=developer.id, - owner_type="user", - owner_id=user.id, - connection_pool=pool, - ) - assert not any(d.id == doc_user.id for d in docs_list) - - -@test("query: delete agent doc") -async def 
_(dsn=pg_dsn, developer=test_developer, agent=test_agent): - pool = await create_db_pool(dsn=dsn) - - # Create a doc owned by the agent - doc_agent = await create_doc( - developer_id=developer.id, - data=CreateDocRequest( - title="Agent Delete Test", - content="Doc for agent deletion test", - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - - # Delete the doc - await delete_doc( - developer_id=developer.id, - doc_id=doc_agent.id, - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - - # Verify doc is no longer in agent's docs - docs_list = await list_docs( - developer_id=developer.id, - owner_type="agent", - owner_id=agent.id, - connection_pool=pool, - ) - assert not any(d.id == doc_agent.id for d in docs_list) - - -@test("query: search docs by text") -async def _(dsn=pg_dsn, agent=test_agent, developer=test_developer): - pool = await create_db_pool(dsn=dsn) - - # Create a test document - doc = await create_doc( - developer_id=developer.id, - owner_type="agent", - owner_id=agent.id, - data=CreateDocRequest( - title="Hello", - content="The world is a funny little thing", - metadata={"test": "test"}, - embed_instruction="Embed the document", - ), - connection_pool=pool, - ) - - # Search using simpler terms first - result = await search_docs_by_text( - developer_id=developer.id, - owners=[("agent", agent.id)], - query="world", - k=3, - search_language="english", - metadata_filter={"test": "test"}, - connection_pool=pool, - ) - - print("\nSearch results:", result) - - # More specific assertions - assert len(result) >= 1, "Should find at least one document" - assert any(d.id == doc.id for d in result), f"Should find document {doc.id}" - assert result[0].metadata == {"test": "test"}, "Metadata should match" +# @test("query: create user doc") +# async def _(dsn=pg_dsn, developer=test_developer, user=test_user): +# pool = await create_db_pool(dsn=dsn) +# doc_created = await create_doc( +# developer_id=developer.id, +# data=CreateDocRequest( +# title="User Doc", +# content=["Docs for user testing", "Docs for user testing 2"], +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# owner_type="user", +# owner_id=user.id, +# connection_pool=pool, +# ) + +# assert doc_created.id is not None + +# # Verify doc appears in user's docs +# found = await get_doc( +# developer_id=developer.id, +# doc_id=doc_created.id, +# connection_pool=pool, +# ) +# assert found.id == doc_created.id + + +# @test("query: create agent doc") +# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): +# pool = await create_db_pool(dsn=dsn) +# doc = await create_doc( +# developer_id=developer.id, +# data=CreateDocRequest( +# title="Agent Doc", +# content="Docs for agent testing", +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) +# assert doc.id is not None + +# # Verify doc appears in agent's docs +# docs_list = await list_docs( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) +# assert any(d.id == doc.id for d in docs_list) + + +# @test("query: get doc") +# async def _(dsn=pg_dsn, developer=test_developer, doc=test_doc): +# pool = await create_db_pool(dsn=dsn) +# doc_test = await get_doc( +# developer_id=developer.id, +# doc_id=doc.id, +# connection_pool=pool, +# ) +# assert doc_test.id == doc.id +# assert 
doc_test.title is not None +# assert doc_test.content is not None + + +# @test("query: list user docs") +# async def _(dsn=pg_dsn, developer=test_developer, user=test_user): +# pool = await create_db_pool(dsn=dsn) + +# # Create a doc owned by the user +# doc_user = await create_doc( +# developer_id=developer.id, +# data=CreateDocRequest( +# title="User List Test", +# content="Some user doc content", +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# owner_type="user", +# owner_id=user.id, +# connection_pool=pool, +# ) + +# # List user's docs +# docs_list = await list_docs( +# developer_id=developer.id, +# owner_type="user", +# owner_id=user.id, +# connection_pool=pool, +# ) +# assert len(docs_list) >= 1 +# assert any(d.id == doc_user.id for d in docs_list) + + +# @test("query: list agent docs") +# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): +# pool = await create_db_pool(dsn=dsn) + +# # Create a doc owned by the agent +# doc_agent = await create_doc( +# developer_id=developer.id, +# data=CreateDocRequest( +# title="Agent List Test", +# content="Some agent doc content", +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) + +# # List agent's docs +# docs_list = await list_docs( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) +# assert len(docs_list) >= 1 +# assert any(d.id == doc_agent.id for d in docs_list) + + +# @test("query: delete user doc") +# async def _(dsn=pg_dsn, developer=test_developer, user=test_user): +# pool = await create_db_pool(dsn=dsn) + +# # Create a doc owned by the user +# doc_user = await create_doc( +# developer_id=developer.id, +# data=CreateDocRequest( +# title="User Delete Test", +# content="Doc for user deletion test", +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# owner_type="user", +# owner_id=user.id, +# connection_pool=pool, +# ) + +# # Delete the doc +# await delete_doc( +# developer_id=developer.id, +# doc_id=doc_user.id, +# owner_type="user", +# owner_id=user.id, +# connection_pool=pool, +# ) + +# # Verify doc is no longer in user's docs +# docs_list = await list_docs( +# developer_id=developer.id, +# owner_type="user", +# owner_id=user.id, +# connection_pool=pool, +# ) +# assert not any(d.id == doc_user.id for d in docs_list) + + +# @test("query: delete agent doc") +# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): +# pool = await create_db_pool(dsn=dsn) + +# # Create a doc owned by the agent +# doc_agent = await create_doc( +# developer_id=developer.id, +# data=CreateDocRequest( +# title="Agent Delete Test", +# content="Doc for agent deletion test", +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) + +# # Delete the doc +# await delete_doc( +# developer_id=developer.id, +# doc_id=doc_agent.id, +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) + +# # Verify doc is no longer in agent's docs +# docs_list = await list_docs( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# connection_pool=pool, +# ) +# assert not any(d.id == doc_agent.id for d in docs_list) + + +# @test("query: search docs by text") +# async def _(dsn=pg_dsn, agent=test_agent, developer=test_developer): +# pool = await create_db_pool(dsn=dsn) + +# # Create a test document 
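The embedding search test kept further below builds its query vector by element-wise averaging of the document's chunk embeddings with `zip(*doc.embeddings)`; a tiny self-contained illustration of that idiom, with made-up vectors:

chunks = [
    [1.0, 2.0, 3.0],
    [3.0, 2.0, 1.0],
]
# zip(*chunks) yields one tuple per dimension, so this averages column-wise
query_embedding = [sum(col) / len(col) for col in zip(*chunks)]
assert query_embedding == [2.0, 2.0, 2.0]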
+# doc = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="Hello", +# content="The world is a funny little thing", +# metadata={"test": "test"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# # Search using simpler terms first +# result = await search_docs_by_text( +# developer_id=developer.id, +# owners=[("agent", agent.id)], +# query="world", +# k=3, +# search_language="english", +# metadata_filter={"test": "test"}, +# connection_pool=pool, +# ) + +# print("\nSearch results:", result) + +# # More specific assertions +# assert len(result) >= 1, "Should find at least one document" +# assert any(d.id == doc.id for d in result), f"Should find document {doc.id}" +# assert result[0].metadata == {"test": "test"}, "Metadata should match" @test("query: search docs by embedding") @@ -282,25 +273,318 @@ async def _( assert result[0].metadata is not None -@test("query: search docs by hybrid") +# @test("query: search docs by hybrid") +# async def _( +# dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding +# ): +# pool = await create_db_pool(dsn=dsn) + +# # Get query embedding by averaging the embeddings (list of floats) +# query_embedding = [sum(k) / len(k) for k in zip(*doc.embeddings)] + +# # Search using the correct parameter types +# result = await search_docs_hybrid( +# developer_id=developer.id, +# owners=[("agent", agent.id)], +# text_query=doc.content[0] if isinstance(doc.content, list) else doc.content, +# embedding=query_embedding, +# k=3, # Add k parameter +# metadata_filter={"test": "test"}, # Add metadata filter +# connection_pool=pool, +# ) + +# assert len(result) >= 1 +# assert result[0].metadata is not None + + +# @test("query: test tsvector with technical terms and phrases") +# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): +# pool = await create_db_pool(dsn=dsn) + +# # Create documents with technical content +# doc1 = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="Technical Document", +# content="API endpoints using REST architecture with JSON payloads", +# metadata={"domain": "technical"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# doc2 = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="More Technical Terms", +# content="Database optimization using indexing and query planning", +# metadata={"domain": "technical"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# # Test with technical terms +# technical_queries = [ +# "API endpoints", +# "REST architecture", +# "database optimization", +# "indexing" +# ] + +# for query in technical_queries: +# results = await search_docs_by_text( +# developer_id=developer.id, +# owners=[("agent", agent.id)], +# query=query, +# k=3, +# search_language="english", +# connection_pool=pool, +# ) + +# print(f"\nSearch results for '{query}':", results) + +# # Verify appropriate document is found based on query +# if "API" in query or "REST" in query: +# assert any(doc.id == doc1.id for doc in results), f"Doc1 should be found with query '{query}'" +# if "database" in query.lower() or "indexing" in query: +# assert any(doc.id == doc2.id for doc in results), f"Doc2 should be found with query '{query}'" + +# @test("query: test tsvector with varying content 
lengths and special characters") +# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): +# pool = await create_db_pool(dsn=dsn) + +# # Create documents with different content lengths +# short_doc = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="Short", +# content="Brief test document", +# metadata={"length": "short"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# medium_doc = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="Medium", +# content="This is a medium length document that contains more words and context for testing purposes", +# metadata={"length": "medium"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# long_doc = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="Long", +# content="This is a much longer document that contains multiple sentences. It includes various terms and phrases. \ +# The purpose is to test how the search handles longer content with more context. \ +# It should still be able to find relevant matches based on the search query.", +# metadata={"length": "long"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# special_doc = await create_doc( +# developer_id=developer.id, +# owner_type="agent", +# owner_id=agent.id, +# data=CreateDocRequest( +# title="Special Characters", +# content="Testing! With? Different... punctuation; marks: and-hyphens, plus+signs & ampersands", +# metadata={"type": "special"}, +# embed_instruction="Embed the document", +# ), +# connection_pool=pool, +# ) + +# # Test cases for different content lengths +# length_test_cases = [ +# ("brief test", short_doc.id), +# ("medium length document", medium_doc.id), +# ("multiple sentences", long_doc.id), +# ("document", None) # Should find all documents +# ] + +# for query, expected_doc_id in length_test_cases: +# results = await search_docs_by_text( +# developer_id=developer.id, +# owners=[("agent", agent.id)], +# query=query, +# k=3, +# search_language="english", +# connection_pool=pool, +# ) + +# print(f"\nSearch results for '{query}':", results) + +# if expected_doc_id: +# assert any(doc.id == expected_doc_id for doc in results), \ +# f"Expected document should be found with query '{query}'" +# else: +# # For general terms, verify multiple documents are found +# assert len(results) > 1, f"Multiple documents should be found with query '{query}'" + +# @test("query: test direct tsvector generation") +# async def _(): +# test_cases = [ +# # Single words +# ( +# "test", +# "'test'" +# ), +# ( +# "testing", +# "'testing'" +# ), + +# # Multiple words in single sentence +# ( +# "quick brown fox", +# "'quick' & 'brown' & 'fox'" +# ), +# ( +# "The Quick Brown Fox", +# "'quick' & 'brown' & 'fox'" +# ), + +# # Technical terms and phrases +# ( +# "machine learning algorithm", +# "('machine' <-> 'learning') & 'algorithm'" +# ), +# ( +# "REST API implementation", +# "'rest' & 'api' & 'implementation'" +# ), + +# # Multiple sentences +# ( +# "Machine learning is great. 
Data science rocks.", +# "('machine' <-> 'learning') & 'great' | ('data' <-> 'science') & 'rocks'" +# ), + +# # Quoted phrases +# ( +# '"quick brown fox"', +# "('quick' <-> 'brown' <-> 'fox')" +# ), +# ( +# 'Find "machine learning" algorithms', +# "('machine' <-> 'learning') & 'algorithms' & 'find'" +# ), + +# # Multiple quoted phrases +# ( +# '"data science" and "machine learning"', +# "('data' <-> 'science') & ('machine' <-> 'learning')" +# ), + +# # Edge cases +# ( +# "", +# "" +# ), +# ( +# "the and or", +# "" +# ), +# ( +# "a", +# "" +# ), +# ( +# "X", +# "'x'" +# ), + +# # Empty quotes +# ( +# '""', +# "" +# ), +# ( +# 'test "" phrase', +# "'test' & 'phrase'" +# ), +# ] + +# for input_text, expected_output in test_cases: +# result = text_to_tsvector_query(input_text) +# print(f"\nInput: '{input_text}'") +# print(f"Generated tsquery: '{result}'") +# print(f"Expected: '{expected_output}'") +# assert result == expected_output, \ +# f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" + + +@test("query: search docs by embedding with different confidence levels") async def _( dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding ): pool = await create_db_pool(dsn=dsn) - # Get query embedding by averaging the embeddings (list of floats) + # Create a test document with a different embedding + # different_embedding = [0.5] * EMBEDDING_SIZE # Create different embedding values + # await pool.execute( + # """ + # INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) + # VALUES ($1, $2, 0, 1, $3, $4) + # """, # Changed chunk_seq from 0 to 1 + # developer.id, + # doc.id, + # "Different test content", + # f"[{', '.join([str(x) for x in different_embedding])}]", + # ) + + # Get query embedding (using original doc's embedding) query_embedding = [sum(k) / len(k) for k in zip(*doc.embeddings)] - # Search using the correct parameter types - result = await search_docs_hybrid( - developer_id=developer.id, - owners=[("agent", agent.id)], - text_query=doc.content[0] if isinstance(doc.content, list) else doc.content, - embedding=query_embedding, - k=3, # Add k parameter - metadata_filter={"test": "test"}, # Add metadata filter - connection_pool=pool, - ) - - assert len(result) >= 1 - assert result[0].metadata is not None + # Test with different confidence levels + confidence_tests = [ + (0.99, 0), # High confidence should find no results + (0.7, 1), # Medium confidence should find some results + (0.5, 2), # Lower confidence should find more results + (0.1, 2), # Very low confidence should find all results + ] + + for confidence, expected_min_results in confidence_tests: + results = await search_docs_by_embedding( + developer_id=developer.id, + owners=[("agent", agent.id)], + embedding=query_embedding, + k=3, + confidence=confidence, + metadata_filter={"test": "test"}, + connection_pool=pool, + ) + + print(f"\nSearch results with confidence {confidence}:") + for r in results: + print(f"- Doc ID: {r.id}, Distance: {r.distance}") + + assert len(results) >= expected_min_results, ( + f"Expected at least {expected_min_results} results with confidence {confidence}, got {len(results)}" + ) + + if results: + # Verify that all returned results meet the confidence threshold + for result in results: + assert result.distance >= confidence, ( + f"Result distance {result.distance} is below confidence threshold {confidence}" + ) From 67fc92d1356ae519ecb7d3d0f38cb3ddcfdf80be Mon Sep 17 00:00:00 2001 From: Dmitry Paramonov Date: Mon, 
13 Jan 2025 13:03:05 +0300 Subject: [PATCH 05/27] fix: Remove unused function as the conversion is done by postgres query --- agents-api/agents_api/common/nlp.py | 68 ++++++++++++++--------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 8a640a535..09fc456a3 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -296,40 +296,40 @@ def union(u: str, v: str) -> None: # return results -def build_ts_query(groups: list[set[str]], proximity_n: int = 10) -> str: - """ - Builds a PostgreSQL tsquery string from groups of keywords. +# def build_ts_query(groups: list[set[str]], proximity_n: int = 10) -> str: +# """ +# Builds a PostgreSQL tsquery string from groups of keywords. - Args: - groups (list[set[str]]): List of keyword groups - proximity_n (int): Maximum distance between words for proximity search +# Args: +# groups (list[set[str]]): List of keyword groups +# proximity_n (int): Maximum distance between words for proximity search - Returns: - str: PostgreSQL tsquery compatible string - """ - if not groups: - return "" +# Returns: +# str: PostgreSQL tsquery compatible string +# """ +# if not groups: +# return "" - query_parts = [] +# query_parts = [] - for group in groups: - if not group: # Skip empty groups - continue +# for group in groups: +# if not group: # Skip empty groups +# continue - if len(group) == 1: - # Single word - just wrap in quotes - word = next(iter(group)) - # No need to check for stopwords since they should be filtered earlier - query_parts.append(f"'{word.lower()}'") - else: - # Multiple words - sort by length (descending) and connect with <-> - sorted_words = sorted(group, key=len, reverse=True) - filtered_words = [word.lower() for word in sorted_words] - if filtered_words: - phrase = " <-> ".join(f"'{word}'" for word in filtered_words) - query_parts.append(f"({phrase})") +# if len(group) == 1: +# # Single word - just wrap in quotes +# word = next(iter(group)) +# # No need to check for stopwords since they should be filtered earlier +# query_parts.append(f"'{word.lower()}'") +# else: +# # Multiple words - sort by length (descending) and connect with <-> +# sorted_words = sorted(group, key=len, reverse=True) +# filtered_words = [word.lower() for word in sorted_words] +# if filtered_words: +# phrase = " <-> ".join(f"'{word}'" for word in filtered_words) +# query_parts.append(f"({phrase})") - return " & ".join(query_parts) if query_parts else "" +# return " & ".join(query_parts) if query_parts else "" @lru_cache(maxsize=1000) @@ -376,10 +376,8 @@ def text_to_tsvector_query( # Find proximity groups and build query groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - query = build_ts_query(groups, proximity_n) - - if query: - queries.append(query) + if groups: + queries.append(" AND ".join([f'({" OR ".join(grp)})' for grp in groups])) return queries @@ -414,9 +412,9 @@ def batch_text_to_tsvector_queries( if not keyword_positions: continue groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - query = build_ts_query(groups, proximity_n) - if query: - queries.append(query) + if groups: + queries.append(" AND ".join([f'({" OR ".join(grp)})' for grp in groups])) + results.append(queries) return results From d21f9805deab853fa1cf51997c9f2a549a807e48 Mon Sep 17 00:00:00 2001 From: whiterabbit1983 Date: Mon, 13 Jan 2025 10:04:04 +0000 Subject: [PATCH 06/27] refactor: Lint agents-api (CI) --- 
agents-api/agents_api/common/nlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 09fc456a3..f33657dec 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -377,7 +377,7 @@ def text_to_tsvector_query( # Find proximity groups and build query groups = find_proximity_groups(keywords, keyword_positions, proximity_n) if groups: - queries.append(" AND ".join([f'({" OR ".join(grp)})' for grp in groups])) + queries.append(" AND ".join([f"({' OR '.join(grp)})" for grp in groups])) return queries @@ -413,7 +413,7 @@ def batch_text_to_tsvector_queries( continue groups = find_proximity_groups(keywords, keyword_positions, proximity_n) if groups: - queries.append(" AND ".join([f'({" OR ".join(grp)})' for grp in groups])) + queries.append(" AND ".join([f"({' OR '.join(grp)})" for grp in groups])) results.append(queries) From 8b19c96af0293142d79483adb7e8b8640deae889 Mon Sep 17 00:00:00 2001 From: Dmitry Paramonov Date: Mon, 13 Jan 2025 13:03:05 +0300 Subject: [PATCH 07/27] fix: Remove unused function as the conversion is done by postgres query --- agents-api/agents_api/common/nlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index f33657dec..705b58e79 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -377,7 +377,7 @@ def text_to_tsvector_query( # Find proximity groups and build query groups = find_proximity_groups(keywords, keyword_positions, proximity_n) if groups: - queries.append(" AND ".join([f"({' OR '.join(grp)})" for grp in groups])) + queries.append(" OR ".join([" OR ".join(grp) for grp in groups])) return queries @@ -413,7 +413,7 @@ def batch_text_to_tsvector_queries( continue groups = find_proximity_groups(keywords, keyword_positions, proximity_n) if groups: - queries.append(" AND ".join([f"({' OR '.join(grp)})" for grp in groups])) + queries.append(" OR ".join([" OR ".join(grp) for grp in groups])) results.append(queries) From 70a78b3be65a797de6814a3a648676ace6010413 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Mon, 13 Jan 2025 17:51:19 -0500 Subject: [PATCH 08/27] fix(agents-api): fixed nlp pipeline for FTS --- agents-api/agents_api/common/nlp.py | 236 +++++++--------------------- 1 file changed, 54 insertions(+), 182 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 705b58e79..233517aa8 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -94,14 +94,22 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str] # Extract and filter spans in a single pass ent_spans = [ent for ent in doc.ents if ent.label_ not in excluded_labels] - chunk_spans = [chunk for chunk in doc.noun_chunks if not chunk.root.is_stop] + # Add more comprehensive stopword filtering for noun chunks + chunk_spans = [ + chunk for chunk in doc.noun_chunks + if not chunk.root.is_stop and not all(token.is_stop for token in chunk) + ] all_spans = filter_spans(ent_spans + chunk_spans) - # Process spans efficiently + # Process spans efficiently and filter out spans that are entirely stopwords keywords = [] seen_texts = set() for span in all_spans: + # Skip if all tokens in span are stopwords + if all(token.is_stop for token in span): + continue + text = span.text.strip() lower_text = text.lower() @@ -180,206 +188,61 @@ def 
union(u: str, v: str) -> None: return list(groups.values()) -# def build_query_pattern(group_size: int, n: int) -> str: -# """Cache query patterns for common group sizes.""" -# if group_size == 1: -# return '"{}"' -# return f"NEAR/{n}(" + " ".join('"{}"' for _ in range(group_size)) + ")" - - -# def build_query(groups: list[set[str]], n: int = 10) -> str: -# """Build query with cached patterns.""" -# clauses = [] - -# for group in groups: -# if len(group) == 1: -# clauses.append(f'"{next(iter(group))}"') -# else: -# # Sort by length descending to prioritize longer phrases -# sorted_group = sorted(group, key=len, reverse=True) -# # Get cached pattern and format with keywords -# pattern = build_query_pattern(len(group), n) -# clause = pattern.format(*sorted_group) -# clauses.append(clause) - -# return " OR ".join(clauses) - - -# @lru_cache(maxsize=100) -# def paragraph_to_custom_queries( -# paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1 -# ) -> list[str]: -# """ -# Optimized paragraph processing with minimal behavior changes. -# Added min_keywords parameter to filter out low-value queries. - -# Args: -# paragraph (str): The input paragraph to convert. -# top_n (int): Number of top keywords to extract per sentence. -# proximity_n (int): The proximity window for NEAR/n. -# min_keywords (int): Minimum number of keywords required to form a query. - -# Returns: -# list[str]: The list of custom query strings. -# """ -# if not paragraph or not paragraph.strip(): -# return [] - -# # Process entire paragraph once -# doc = nlp(paragraph) -# queries = [] - -# # Process sentences -# for sent in doc.sents: -# # Convert to doc for consistent API -# sent_doc = sent.as_doc() - -# # Extract and clean keywords -# keywords = extract_keywords(sent_doc, top_n) -# if len(keywords) < min_keywords: -# continue - -# # Find keyword positions using matcher -# keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) - -# # Skip if no keywords found in positions -# if not keyword_positions: -# continue - -# # Find proximity groups and build query -# groups = find_proximity_groups(keywords, keyword_positions, proximity_n) -# query = build_query(groups, proximity_n) - -# if query: -# queries.append(query) - -# return queries - - -# def batch_paragraphs_to_custom_queries( -# paragraphs: list[str], -# top_n: int = 10, -# proximity_n: int = 10, -# min_keywords: int = 1, -# n_process: int = 1, -# ) -> list[list[str]]: -# """ -# Processes multiple paragraphs using nlp.pipe for better performance. - -# Args: -# paragraphs (list[str]): list of paragraphs to process. -# top_n (int): Number of top keywords to extract per sentence. -# proximity_n (int): The proximity window for NEAR/n. -# min_keywords (int): Minimum number of keywords required to form a query. -# n_process (int): Number of processes to use for multiprocessing. - -# Returns: -# list[list[str]]: A list where each element is a list of queries for a paragraph. 
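The extract_keywords change earlier in this patch drops noun chunks made up entirely of stopwords. A small standalone sketch of that filter, assuming the en_core_web_sm model is installed; the actual `nlp` object in nlp.py is loaded elsewhere and may differ.

import spacy

nlp_demo = spacy.load("en_core_web_sm")  # noun_chunks needs a dependency parse
doc = nlp_demo("This is a quick brown fox jumping over the lazy dog.")

chunks = [
    chunk.text
    for chunk in doc.noun_chunks
    if not chunk.root.is_stop and not all(token.is_stop for token in chunk)
]
# A chunk consisting only of stopwords (e.g. a bare pronoun) is filtered out
print(chunks)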
-# """ -# results = [] -# for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): -# queries = [] -# for sent in doc.sents: -# sent_doc = sent.as_doc() -# keywords = extract_keywords(sent_doc, top_n) -# if len(keywords) < min_keywords: -# continue -# keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) -# if not keyword_positions: -# continue -# groups = find_proximity_groups(keywords, keyword_positions, proximity_n) -# query = build_query(groups, proximity_n) -# if query: -# queries.append(query) -# results.append(queries) - -# return results - - -# def build_ts_query(groups: list[set[str]], proximity_n: int = 10) -> str: -# """ -# Builds a PostgreSQL tsquery string from groups of keywords. - -# Args: -# groups (list[set[str]]): List of keyword groups -# proximity_n (int): Maximum distance between words for proximity search - -# Returns: -# str: PostgreSQL tsquery compatible string -# """ -# if not groups: -# return "" - -# query_parts = [] - -# for group in groups: -# if not group: # Skip empty groups -# continue - -# if len(group) == 1: -# # Single word - just wrap in quotes -# word = next(iter(group)) -# # No need to check for stopwords since they should be filtered earlier -# query_parts.append(f"'{word.lower()}'") -# else: -# # Multiple words - sort by length (descending) and connect with <-> -# sorted_words = sorted(group, key=len, reverse=True) -# filtered_words = [word.lower() for word in sorted_words] -# if filtered_words: -# phrase = " <-> ".join(f"'{word}'" for word in filtered_words) -# query_parts.append(f"({phrase})") - -# return " & ".join(query_parts) if query_parts else "" - - @lru_cache(maxsize=1000) def text_to_tsvector_query( paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1 ) -> str: """ - Optimized paragraph processing with minimal behavior changes. - Added min_keywords parameter to filter out low-value queries. + Extracts meaningful keywords/phrases from text and joins them with OR. + + Example: + Input: "I like basketball especially Michael Jordan" + Output: "basketball OR Michael Jordan" Args: - paragraph (str): The input paragraph to convert. - top_n (int): Number of top keywords to extract per sentence. - proximity_n (int): The proximity window for NEAR/n. - min_keywords (int): Minimum number of keywords required to form a query. 
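Per the commit message earlier in this series ("the conversion is done by postgres query"), the function now returns plain keywords joined by OR, as in the basketball example in the docstring above. Below is a hedged sketch of the kind of server-side conversion that output format lends itself to; the actual SQL used by search_docs_by_text is not shown in this patch.

import asyncpg

async def convert_keywords(dsn: str, keywords: str) -> str:
    # websearch_to_tsquery understands the bare OR keyword,
    # e.g. "basketball OR Michael Jordan" from the docstring example above.
    conn = await asyncpg.connect(dsn)
    try:
        return await conn.fetchval(
            "SELECT websearch_to_tsquery('english', $1)::text", keywords
        )
    finally:
        await conn.close()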
- + paragraph (str): The input text to process + top_n (int): Number of top keywords to extract per sentence + proximity_n (int): The proximity window for grouping related keywords + min_keywords (int): Minimum number of keywords required Returns: - str: PostgreSQL tsquery compatible string + str: Keywords/phrases joined by OR """ if not paragraph or not paragraph.strip(): - return [] + return "" - # Process entire paragraph once doc = nlp(paragraph) - queries = [] + queries = set() # Use set to avoid duplicates - # Process sentences for sent in doc.sents: - # Convert to doc for consistent API sent_doc = sent.as_doc() - - # Extract and clean keywords + + # Extract keywords keywords = extract_keywords(sent_doc, top_n) if len(keywords) < min_keywords: continue - # Find keyword positions using matcher + # Find keyword positions keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) - - # Skip if no keywords found in positions if not keyword_positions: continue - # Find proximity groups and build query + # Group related keywords by proximity groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - if groups: - queries.append(" OR ".join([" OR ".join(grp) for grp in groups])) - return queries + # Add each group as a single term to our set + for group in groups: + if len(group) > 1: + # Sort by length descending to prioritize longer phrases + sorted_group = sorted(group, key=len, reverse=True) + # For truly proximate multi-word groups, group words + queries.add(" OR ".join(sorted_group)) + else: + # For non-proximate words or single words, add them separately + queries.update(group) + + # Join all terms with " OR " + return " OR ".join(queries) if queries else "" def batch_text_to_tsvector_queries( @@ -388,7 +251,7 @@ def batch_text_to_tsvector_queries( proximity_n: int = 10, min_keywords: int = 1, n_process: int = 1, -) -> list[list[str]]: +) -> list[str]: """ Processes multiple paragraphs using nlp.pipe for better performance. 
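For context on the nlp.pipe call used in the batch function above, here is a minimal runnable sketch of streaming several texts through a pipeline. A blank pipeline with a sentencizer stands in for the full model that nlp.py is assumed to load, so the `disable` list from the patch is omitted here.

import spacy

nlp_demo = spacy.blank("en")
nlp_demo.add_pipe("sentencizer")

paragraphs = [
    "Machine learning is great. Databases need indexes.",
    "Full text search uses tsquery strings.",
]

# pipe() streams documents in batches instead of calling the pipeline once per text;
# n_process > 1 would fan the work out to multiple processes, as in the patch.
for doc in nlp_demo.pipe(paragraphs, batch_size=32, n_process=1):
    print([sent.text for sent in doc.sents])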
@@ -402,7 +265,7 @@ def batch_text_to_tsvector_queries( results = [] for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): - queries = [] + queries = set() # Use set to avoid duplicates for sent in doc.sents: sent_doc = sent.as_doc() keywords = extract_keywords(sent_doc, top_n) @@ -412,9 +275,18 @@ def batch_text_to_tsvector_queries( if not keyword_positions: continue groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - if groups: - queries.append(" OR ".join([" OR ".join(grp) for grp in groups])) - - results.append(queries) + # Add each group as a single term to our set + for group in groups: + if len(group) > 1: + # Sort by length descending to prioritize longer phrases + sorted_group = sorted(group, key=len, reverse=True) + # For truly proximate multi-word groups, group words + queries.add(" OR ".join(sorted_group)) + else: + # For non-proximate words or single words, add them separately + queries.update(group) + + # Join all terms with " OR " + results.append(" OR ".join(queries) if queries else "") return results From ab8e3b782c0cfb068c9e9320e6015613a11b4647 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Mon, 13 Jan 2025 17:52:45 -0500 Subject: [PATCH 09/27] chore(tests): added test for the nlp utility + FTS search --- agents-api/tests/test_docs_queries.py | 994 ++++++++++++-------------- 1 file changed, 469 insertions(+), 525 deletions(-) diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index 70f13a129..fd82d7396 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -1,253 +1,366 @@ +from agents_api.autogen.openapi_model import CreateDocRequest from agents_api.clients.pg import create_db_pool +from agents_api.queries.docs.create_doc import create_doc +from agents_api.queries.docs.delete_doc import delete_doc +from agents_api.queries.docs.get_doc import get_doc +from agents_api.queries.docs.list_docs import list_docs from agents_api.queries.docs.search_docs_by_embedding import search_docs_by_embedding +from agents_api.queries.docs.search_docs_by_text import search_docs_by_text +from agents_api.queries.docs.search_docs_hybrid import search_docs_hybrid from ward import test +from agents_api.common.nlp import text_to_tsvector_query + from .fixtures import ( pg_dsn, test_agent, test_developer, + test_doc, test_doc_with_embedding, + test_user, ) EMBEDDING_SIZE: int = 1024 +import math + +def make_vector_with_similarity(n: int, d: float): + """ + Returns a list `v` of length `n` such that the cosine similarity + between `v` and the all-ones vector of length `n` is approximately d. 
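The construction used in the helper's body below can be checked numerically: with v = sign(d) * ones + alpha * u, where u is a unit vector whose entries sum to zero and alpha = sqrt(n * (1 - d^2)) / |d|, the cosine against the all-ones vector works out to d. A small self-contained check; n and d are arbitrary sample values.

import math

def cosine(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b)))

n, d = 1024, 0.7
u = [0.0] * n
u[0], u[1] = 1 / math.sqrt(2), -1 / math.sqrt(2)   # unit vector, entries sum to 0
alpha = math.sqrt(n * (1 - d * d)) / abs(d)
v = [math.copysign(1.0, d) + alpha * u_i for u_i in u]

assert abs(cosine(v, [1.0] * n) - d) < 1e-9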
+ """ + if not -1.0 <= d <= 1.0: + raise ValueError("d must lie in [-1, 1].") + + # Handle special cases exactly: + if abs(d - 1.0) < 1e-12: # d ~ +1 + return [1.0] * n + if abs(d + 1.0) < 1e-12: # d ~ -1 + return [-1.0] * n + if abs(d) < 1e-12: # d ~ 0 + v = [0.0]*n + if n >= 2: + v[0] = 1.0 + v[1] = -1.0 + return v + + sign_d = 1.0 if d >= 0 else -1.0 + + # Base part: sign(d)*[1,1,...,1] + base = [sign_d]*n + + # Orthogonal unit vector u with sum(u)=0; for simplicity: + # u = [1/sqrt(2), -1/sqrt(2), 0, 0, ..., 0] + u = [0.0]*n + if n >= 2: + u[0] = 1.0 / math.sqrt(2) + u[1] = -1.0 / math.sqrt(2) + # (if n=1, there's no truly orthogonal vector to [1], so skip) + + # Solve for alpha: + # alpha^2 = n*(1 - d^2)/d^2 + alpha = math.sqrt(n*(1 - d*d)) / abs(d) + + # Construct v + v = [0.0]*n + for i in range(n): + v[i] = base[i] + alpha * u[i] + + return v + + +@test("query: create user doc") +async def _(dsn=pg_dsn, developer=test_developer, user=test_user): + pool = await create_db_pool(dsn=dsn) + doc_created = await create_doc( + developer_id=developer.id, + data=CreateDocRequest( + title="User Doc", + content=["Docs for user testing", "Docs for user testing 2"], + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + owner_type="user", + owner_id=user.id, + connection_pool=pool, + ) -# @test("query: create user doc") -# async def _(dsn=pg_dsn, developer=test_developer, user=test_user): -# pool = await create_db_pool(dsn=dsn) -# doc_created = await create_doc( -# developer_id=developer.id, -# data=CreateDocRequest( -# title="User Doc", -# content=["Docs for user testing", "Docs for user testing 2"], -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# owner_type="user", -# owner_id=user.id, -# connection_pool=pool, -# ) - -# assert doc_created.id is not None - -# # Verify doc appears in user's docs -# found = await get_doc( -# developer_id=developer.id, -# doc_id=doc_created.id, -# connection_pool=pool, -# ) -# assert found.id == doc_created.id - - -# @test("query: create agent doc") -# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): -# pool = await create_db_pool(dsn=dsn) -# doc = await create_doc( -# developer_id=developer.id, -# data=CreateDocRequest( -# title="Agent Doc", -# content="Docs for agent testing", -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) -# assert doc.id is not None - -# # Verify doc appears in agent's docs -# docs_list = await list_docs( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) -# assert any(d.id == doc.id for d in docs_list) - - -# @test("query: get doc") -# async def _(dsn=pg_dsn, developer=test_developer, doc=test_doc): -# pool = await create_db_pool(dsn=dsn) -# doc_test = await get_doc( -# developer_id=developer.id, -# doc_id=doc.id, -# connection_pool=pool, -# ) -# assert doc_test.id == doc.id -# assert doc_test.title is not None -# assert doc_test.content is not None - - -# @test("query: list user docs") -# async def _(dsn=pg_dsn, developer=test_developer, user=test_user): -# pool = await create_db_pool(dsn=dsn) + assert doc_created.id is not None -# # Create a doc owned by the user -# doc_user = await create_doc( -# developer_id=developer.id, -# data=CreateDocRequest( -# title="User List Test", -# content="Some user doc content", -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# 
owner_type="user", -# owner_id=user.id, -# connection_pool=pool, -# ) - -# # List user's docs -# docs_list = await list_docs( -# developer_id=developer.id, -# owner_type="user", -# owner_id=user.id, -# connection_pool=pool, -# ) -# assert len(docs_list) >= 1 -# assert any(d.id == doc_user.id for d in docs_list) - - -# @test("query: list agent docs") -# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): -# pool = await create_db_pool(dsn=dsn) + # Verify doc appears in user's docs + found = await get_doc( + developer_id=developer.id, + doc_id=doc_created.id, + connection_pool=pool, + ) + assert found.id == doc_created.id -# # Create a doc owned by the agent -# doc_agent = await create_doc( -# developer_id=developer.id, -# data=CreateDocRequest( -# title="Agent List Test", -# content="Some agent doc content", -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) - -# # List agent's docs -# docs_list = await list_docs( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) -# assert len(docs_list) >= 1 -# assert any(d.id == doc_agent.id for d in docs_list) - - -# @test("query: delete user doc") -# async def _(dsn=pg_dsn, developer=test_developer, user=test_user): -# pool = await create_db_pool(dsn=dsn) -# # Create a doc owned by the user -# doc_user = await create_doc( -# developer_id=developer.id, -# data=CreateDocRequest( -# title="User Delete Test", -# content="Doc for user deletion test", -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# owner_type="user", -# owner_id=user.id, -# connection_pool=pool, -# ) - -# # Delete the doc -# await delete_doc( -# developer_id=developer.id, -# doc_id=doc_user.id, -# owner_type="user", -# owner_id=user.id, -# connection_pool=pool, -# ) - -# # Verify doc is no longer in user's docs -# docs_list = await list_docs( -# developer_id=developer.id, -# owner_type="user", -# owner_id=user.id, -# connection_pool=pool, -# ) -# assert not any(d.id == doc_user.id for d in docs_list) - - -# @test("query: delete agent doc") -# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): -# pool = await create_db_pool(dsn=dsn) +@test("query: create agent doc") +async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): + pool = await create_db_pool(dsn=dsn) + doc = await create_doc( + developer_id=developer.id, + data=CreateDocRequest( + title="Agent Doc", + content="Docs for agent testing", + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + assert doc.id is not None + + # Verify doc appears in agent's docs + docs_list = await list_docs( + developer_id=developer.id, + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + assert any(d.id == doc.id for d in docs_list) -# # Create a doc owned by the agent -# doc_agent = await create_doc( -# developer_id=developer.id, -# data=CreateDocRequest( -# title="Agent Delete Test", -# content="Doc for agent deletion test", -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) - -# # Delete the doc -# await delete_doc( -# developer_id=developer.id, -# doc_id=doc_agent.id, -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) - -# # Verify doc is no longer in agent's docs -# docs_list = 
await list_docs( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# connection_pool=pool, -# ) -# assert not any(d.id == doc_agent.id for d in docs_list) - - -# @test("query: search docs by text") -# async def _(dsn=pg_dsn, agent=test_agent, developer=test_developer): -# pool = await create_db_pool(dsn=dsn) -# # Create a test document -# doc = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="Hello", -# content="The world is a funny little thing", -# metadata={"test": "test"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# # Search using simpler terms first -# result = await search_docs_by_text( -# developer_id=developer.id, -# owners=[("agent", agent.id)], -# query="world", -# k=3, -# search_language="english", -# metadata_filter={"test": "test"}, -# connection_pool=pool, -# ) - -# print("\nSearch results:", result) - -# # More specific assertions -# assert len(result) >= 1, "Should find at least one document" -# assert any(d.id == doc.id for d in result), f"Should find document {doc.id}" -# assert result[0].metadata == {"test": "test"}, "Metadata should match" +@test("query: get doc") +async def _(dsn=pg_dsn, developer=test_developer, doc=test_doc): + pool = await create_db_pool(dsn=dsn) + doc_test = await get_doc( + developer_id=developer.id, + doc_id=doc.id, + connection_pool=pool, + ) + assert doc_test.id == doc.id + assert doc_test.title is not None + assert doc_test.content is not None +@test("query: list user docs") +async def _(dsn=pg_dsn, developer=test_developer, user=test_user): + pool = await create_db_pool(dsn=dsn) + + # Create a doc owned by the user + doc_user = await create_doc( + developer_id=developer.id, + data=CreateDocRequest( + title="User List Test", + content="Some user doc content", + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + owner_type="user", + owner_id=user.id, + connection_pool=pool, + ) + + # List user's docs + docs_list = await list_docs( + developer_id=developer.id, + owner_type="user", + owner_id=user.id, + connection_pool=pool, + ) + assert len(docs_list) >= 1 + assert any(d.id == doc_user.id for d in docs_list) + + +@test("query: list agent docs") +async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): + pool = await create_db_pool(dsn=dsn) + + # Create a doc owned by the agent + doc_agent = await create_doc( + developer_id=developer.id, + data=CreateDocRequest( + title="Agent List Test", + content="Some agent doc content", + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + + # List agent's docs + docs_list = await list_docs( + developer_id=developer.id, + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + assert len(docs_list) >= 1 + assert any(d.id == doc_agent.id for d in docs_list) + + +@test("query: delete user doc") +async def _(dsn=pg_dsn, developer=test_developer, user=test_user): + pool = await create_db_pool(dsn=dsn) + + # Create a doc owned by the user + doc_user = await create_doc( + developer_id=developer.id, + data=CreateDocRequest( + title="User Delete Test", + content="Doc for user deletion test", + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + owner_type="user", + owner_id=user.id, + connection_pool=pool, + ) + + # Delete the doc + await delete_doc( + developer_id=developer.id, + doc_id=doc_user.id, + 
owner_type="user", + owner_id=user.id, + connection_pool=pool, + ) + + # Verify doc is no longer in user's docs + docs_list = await list_docs( + developer_id=developer.id, + owner_type="user", + owner_id=user.id, + connection_pool=pool, + ) + assert not any(d.id == doc_user.id for d in docs_list) + + +@test("query: delete agent doc") +async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): + pool = await create_db_pool(dsn=dsn) + + # Create a doc owned by the agent + doc_agent = await create_doc( + developer_id=developer.id, + data=CreateDocRequest( + title="Agent Delete Test", + content="Doc for agent deletion test", + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + + # Delete the doc + await delete_doc( + developer_id=developer.id, + doc_id=doc_agent.id, + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + + # Verify doc is no longer in agent's docs + docs_list = await list_docs( + developer_id=developer.id, + owner_type="agent", + owner_id=agent.id, + connection_pool=pool, + ) + assert not any(d.id == doc_agent.id for d in docs_list) + + +@test("query: search docs by text") +async def _(dsn=pg_dsn, agent=test_agent, developer=test_developer): + pool = await create_db_pool(dsn=dsn) + + # Create a test document + doc = await create_doc( + developer_id=developer.id, + owner_type="agent", + owner_id=agent.id, + data=CreateDocRequest( + title="Hello", + content="The world is a funny little thing", + metadata={"test": "test"}, + embed_instruction="Embed the document", + ), + connection_pool=pool, + ) + + # Search using simpler terms first + result = await search_docs_by_text( + developer_id=developer.id, + owners=[("agent", agent.id)], + query="world", + k=3, + search_language="english", + metadata_filter={"test": "test"}, + connection_pool=pool, + ) + + print("\nSearch results:", result) + + # More specific assertions + assert len(result) >= 1, "Should find at least one document" + assert any(d.id == doc.id for d in result), f"Should find document {doc.id}" + assert result[0].metadata == {"test": "test"}, "Metadata should match" + +@test("query: search docs by text with technical terms and phrases") +async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): + pool = await create_db_pool(dsn=dsn) + + # Create documents with technical content + doc1 = await create_doc( + developer_id=developer.id, + owner_type="agent", + owner_id=agent.id, + data=CreateDocRequest( + title="Technical Document", + content="API endpoints using REST architecture with JSON payloads", + metadata={"domain": "technical"}, + embed_instruction="Embed the document", + ), + connection_pool=pool, + ) + + doc2 = await create_doc( + developer_id=developer.id, + owner_type="agent", + owner_id=agent.id, + data=CreateDocRequest( + title="More Technical Terms", + content="Database optimization using indexing and query planning", + metadata={"domain": "technical"}, + embed_instruction="Embed the document", + ), + connection_pool=pool, + ) + + # Test with technical terms + technical_queries = [ + "API endpoints", + "REST architecture", + "database optimization", + "indexing" + ] + + for query in technical_queries: + results = await search_docs_by_text( + developer_id=developer.id, + owners=[("agent", agent.id)], + query=query, + k=3, + search_language="english", + connection_pool=pool, + ) + + print(f"\nSearch results for '{query}':", results) + + # Verify appropriate document is found 
based on query + if "API" in query or "REST" in query: + assert any(doc.id == doc1.id for doc in results), f"Doc1 should be found with query '{query}'" + if "database" in query.lower() or "indexing" in query: + assert any(doc.id == doc2.id for doc in results), f"Doc2 should be found with query '{query}'" + @test("query: search docs by embedding") async def _( dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding @@ -273,318 +386,149 @@ async def _( assert result[0].metadata is not None -# @test("query: search docs by hybrid") -# async def _( -# dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding -# ): -# pool = await create_db_pool(dsn=dsn) - -# # Get query embedding by averaging the embeddings (list of floats) -# query_embedding = [sum(k) / len(k) for k in zip(*doc.embeddings)] - -# # Search using the correct parameter types -# result = await search_docs_hybrid( -# developer_id=developer.id, -# owners=[("agent", agent.id)], -# text_query=doc.content[0] if isinstance(doc.content, list) else doc.content, -# embedding=query_embedding, -# k=3, # Add k parameter -# metadata_filter={"test": "test"}, # Add metadata filter -# connection_pool=pool, -# ) - -# assert len(result) >= 1 -# assert result[0].metadata is not None +@test("query: search docs by hybrid") +async def _( + dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding +): + pool = await create_db_pool(dsn=dsn) + # Get query embedding by averaging the embeddings (list of floats) + query_embedding = [sum(k) / len(k) for k in zip(*doc.embeddings)] -# @test("query: test tsvector with technical terms and phrases") -# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): -# pool = await create_db_pool(dsn=dsn) + # Search using the correct parameter types + result = await search_docs_hybrid( + developer_id=developer.id, + owners=[("agent", agent.id)], + text_query=doc.content[0] if isinstance(doc.content, list) else doc.content, + embedding=query_embedding, + k=3, # Add k parameter + metadata_filter={"test": "test"}, # Add metadata filter + connection_pool=pool, + ) -# # Create documents with technical content -# doc1 = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="Technical Document", -# content="API endpoints using REST architecture with JSON payloads", -# metadata={"domain": "technical"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# doc2 = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="More Technical Terms", -# content="Database optimization using indexing and query planning", -# metadata={"domain": "technical"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# # Test with technical terms -# technical_queries = [ -# "API endpoints", -# "REST architecture", -# "database optimization", -# "indexing" -# ] + assert len(result) >= 1 + assert result[0].metadata is not None -# for query in technical_queries: -# results = await search_docs_by_text( -# developer_id=developer.id, -# owners=[("agent", agent.id)], -# query=query, -# k=3, -# search_language="english", -# connection_pool=pool, -# ) +@test("utility: test text_to_tsvector_query") +async def _(): + test_cases = [ + # Single words + ( + "test", + "test" + ), + + # Multiple words in single sentence + ( + "quick brown fox", + "quick brown fox" # Now kept 
as a single phrase due to proximity + ), + + # Technical terms and phrases + ( + "Machine Learning algorithm", + "machine learning algorithm" # Common technical phrase + ), + # Multiple sentences + ( + "Machine learning is great. Data science rocks.", + "machine learning OR data science rocks" + ), + + # Quoted phrases + ( + '"quick brown fox"', + "quick brown fox" # Quotes removed, phrase kept together + ), + ( + 'Find "machine learning" algorithms', + "machine learning" + ), + + # Multiple quoted phrases + ( + '"data science" and "machine learning"', + "machine learning OR data science" + ), + + # Edge cases + ( + "", + "" + ), + ( + "the and or", + "" # All stop words should result in empty string + ), + ( + "a", + "" # Single stop word should result in empty string + ), + ( + "X", + "X" + ), + + # Empty quotes + ( + '""', + "" + ), + ( + 'test "" phrase', + "phrase OR test" + ), + ] -# print(f"\nSearch results for '{query}':", results) + for input_text, expected_output in test_cases: + print(f"Input: '{input_text}'") + result = text_to_tsvector_query(input_text) + print(f"Generated query: '{result}'") + print(f"Expected: '{expected_output}'\n") + assert result.lower() == expected_output.lower(), \ + f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" -# # Verify appropriate document is found based on query -# if "API" in query or "REST" in query: -# assert any(doc.id == doc1.id for doc in results), f"Doc1 should be found with query '{query}'" -# if "database" in query.lower() or "indexing" in query: -# assert any(doc.id == doc2.id for doc in results), f"Doc2 should be found with query '{query}'" -# @test("query: test tsvector with varying content lengths and special characters") -# async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): +# @test("query: search docs by embedding with different confidence levels") +# async def _( +# dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding +# ): # pool = await create_db_pool(dsn=dsn) -# # Create documents with different content lengths -# short_doc = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="Short", -# content="Brief test document", -# metadata={"length": "short"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# medium_doc = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="Medium", -# content="This is a medium length document that contains more words and context for testing purposes", -# metadata={"length": "medium"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# long_doc = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="Long", -# content="This is a much longer document that contains multiple sentences. It includes various terms and phrases. \ -# The purpose is to test how the search handles longer content with more context. \ -# It should still be able to find relevant matches based on the search query.", -# metadata={"length": "long"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# special_doc = await create_doc( -# developer_id=developer.id, -# owner_type="agent", -# owner_id=agent.id, -# data=CreateDocRequest( -# title="Special Characters", -# content="Testing! With? Different... 
punctuation; marks: and-hyphens, plus+signs & ampersands", -# metadata={"type": "special"}, -# embed_instruction="Embed the document", -# ), -# connection_pool=pool, -# ) - -# # Test cases for different content lengths -# length_test_cases = [ -# ("brief test", short_doc.id), -# ("medium length document", medium_doc.id), -# ("multiple sentences", long_doc.id), -# ("document", None) # Should find all documents +# # Get query embedding (using original doc's embedding) +# query_embedding = make_vector_with_similarity(EMBEDDING_SIZE, 0.7) + +# # Test with different confidence levels +# confidence_tests = [ +# (0.99, 0), # Very high similarity threshold - should find no results +# (0.7, 1), # High similarity - should find 1 result (the embedding with all 1.0s) +# (0.3, 2), # Medium similarity - should find 2 results (including 0.3-0.7 embedding) +# (-0.8, 3), # Low similarity - should find 3 results (including -0.8 to 0.8 embedding) +# (-1.0, 4) # Lowest similarity - should find all 4 results (including alternating -1/1) # ] -# for query, expected_doc_id in length_test_cases: -# results = await search_docs_by_text( +# for confidence, expected_min_results in confidence_tests: +# results = await search_docs_by_embedding( # developer_id=developer.id, # owners=[("agent", agent.id)], -# query=query, +# embedding=query_embedding, # k=3, -# search_language="english", +# confidence=confidence, +# metadata_filter={"test": "test"}, # connection_pool=pool, # ) -# print(f"\nSearch results for '{query}':", results) - -# if expected_doc_id: -# assert any(doc.id == expected_doc_id for doc in results), \ -# f"Expected document should be found with query '{query}'" -# else: -# # For general terms, verify multiple documents are found -# assert len(results) > 1, f"Multiple documents should be found with query '{query}'" - -# @test("query: test direct tsvector generation") -# async def _(): -# test_cases = [ -# # Single words -# ( -# "test", -# "'test'" -# ), -# ( -# "testing", -# "'testing'" -# ), - -# # Multiple words in single sentence -# ( -# "quick brown fox", -# "'quick' & 'brown' & 'fox'" -# ), -# ( -# "The Quick Brown Fox", -# "'quick' & 'brown' & 'fox'" -# ), - -# # Technical terms and phrases -# ( -# "machine learning algorithm", -# "('machine' <-> 'learning') & 'algorithm'" -# ), -# ( -# "REST API implementation", -# "'rest' & 'api' & 'implementation'" -# ), - -# # Multiple sentences -# ( -# "Machine learning is great. 
Data science rocks.", -# "('machine' <-> 'learning') & 'great' | ('data' <-> 'science') & 'rocks'" -# ), - -# # Quoted phrases -# ( -# '"quick brown fox"', -# "('quick' <-> 'brown' <-> 'fox')" -# ), -# ( -# 'Find "machine learning" algorithms', -# "('machine' <-> 'learning') & 'algorithms' & 'find'" -# ), - -# # Multiple quoted phrases -# ( -# '"data science" and "machine learning"', -# "('data' <-> 'science') & ('machine' <-> 'learning')" -# ), - -# # Edge cases -# ( -# "", -# "" -# ), -# ( -# "the and or", -# "" -# ), -# ( -# "a", -# "" -# ), -# ( -# "X", -# "'x'" -# ), - -# # Empty quotes -# ( -# '""', -# "" -# ), -# ( -# 'test "" phrase', -# "'test' & 'phrase'" -# ), -# ] - -# for input_text, expected_output in test_cases: -# result = text_to_tsvector_query(input_text) -# print(f"\nInput: '{input_text}'") -# print(f"Generated tsquery: '{result}'") -# print(f"Expected: '{expected_output}'") -# assert result == expected_output, \ -# f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" - +# print(f"\nSearch results with confidence {confidence}:") +# for r in results: +# print(f"- Doc ID: {r.id}, Distance: {r.distance}") -@test("query: search docs by embedding with different confidence levels") -async def _( - dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding -): - pool = await create_db_pool(dsn=dsn) - - # Create a test document with a different embedding - # different_embedding = [0.5] * EMBEDDING_SIZE # Create different embedding values - # await pool.execute( - # """ - # INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) - # VALUES ($1, $2, 0, 1, $3, $4) - # """, # Changed chunk_seq from 0 to 1 - # developer.id, - # doc.id, - # "Different test content", - # f"[{', '.join([str(x) for x in different_embedding])}]", - # ) - - # Get query embedding (using original doc's embedding) - query_embedding = [sum(k) / len(k) for k in zip(*doc.embeddings)] - - # Test with different confidence levels - confidence_tests = [ - (0.99, 0), # High confidence should find no results - (0.7, 1), # Medium confidence should find some results - (0.5, 2), # Lower confidence should find more results - (0.1, 2), # Very low confidence should find all results - ] - - for confidence, expected_min_results in confidence_tests: - results = await search_docs_by_embedding( - developer_id=developer.id, - owners=[("agent", agent.id)], - embedding=query_embedding, - k=3, - confidence=confidence, - metadata_filter={"test": "test"}, - connection_pool=pool, - ) - - print(f"\nSearch results with confidence {confidence}:") - for r in results: - print(f"- Doc ID: {r.id}, Distance: {r.distance}") - - assert len(results) >= expected_min_results, ( - f"Expected at least {expected_min_results} results with confidence {confidence}, got {len(results)}" - ) +# assert len(results) >= expected_min_results, ( +# f"Expected at least {expected_min_results} results with confidence {confidence}, got {len(results)}" +# ) - if results: - # Verify that all returned results meet the confidence threshold - for result in results: - assert result.distance >= confidence, ( - f"Result distance {result.distance} is below confidence threshold {confidence}" - ) +# if results: +# # Verify that all returned results meet the confidence threshold +# for result in results: +# assert result.distance >= confidence, ( +# f"Result distance {result.distance} is below confidence threshold {confidence}" +# ) From 41ae093075e3279b78adc2fa03fe645f8c8e8077 Mon Sep 17 
00:00:00 2001 From: vedantsahai18 Date: Mon, 13 Jan 2025 17:53:29 -0500 Subject: [PATCH 10/27] chore: misc code refactor --- .../queries/docs/search_docs_by_text.py | 3 ++ agents-api/tests/fixtures.py | 31 +++++++++++++++++-- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/agents-api/agents_api/queries/docs/search_docs_by_text.py b/agents-api/agents_api/queries/docs/search_docs_by_text.py index 77fb3a0e6..44e1bb731 100644 --- a/agents-api/agents_api/queries/docs/search_docs_by_text.py +++ b/agents-api/agents_api/queries/docs/search_docs_by_text.py @@ -7,6 +7,7 @@ from ...autogen.openapi_model import DocReference from ...common.utils.db_exceptions import common_db_exceptions from ..utils import pg_query, rewrap_exceptions, wrap_in_class +from ...common.nlp import text_to_tsvector_query from .utils import transform_to_doc_reference # Raw query for text search @@ -60,6 +61,8 @@ async def search_docs_by_text( # Extract owner types and IDs owner_types: list[str] = [owner[0] for owner in owners] owner_ids: list[str] = [str(owner[1]) for owner in owners] + # Pre-process rawtext query + # query = text_to_tsvector_query(query) return ( search_docs_text_query, diff --git a/agents-api/tests/fixtures.py b/agents-api/tests/fixtures.py index 166bbef73..43eb47b9a 100644 --- a/agents-api/tests/fixtures.py +++ b/agents-api/tests/fixtures.py @@ -175,15 +175,40 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test f"[{', '.join([str(x) for x in [1.0] * 1024])}]", ) + # Insert embedding with random values between 0.3 and 0.7 await pool.execute( """ INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) VALUES ($1, $2, 0, 1, $3, $4) - """, # Changed chunk_seq from 0 to 1 + """, developer.id, doc.id, - "Different test content", - f"[{', '.join([str(x) for x in [0.5] * 1024])}]", + "Test content 1", + f"[{', '.join([str(0.3 + 0.4 * (i % 3) / 2) for i in range(1024)])}]", + ) + + # Insert embedding with random values between -0.8 and 0.8 + await pool.execute( + """ + INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) + VALUES ($1, $2, 0, 2, $3, $4) + """, + developer.id, + doc.id, + "Test content 2", + f"[{', '.join([str(-0.8 + 1.6 * (i % 5) / 4) for i in range(1024)])}]", + ) + + # Insert embedding with alternating -1 and 1 + await pool.execute( + """ + INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) + VALUES ($1, $2, 0, 3, $3, $4) + """, + developer.id, + doc.id, + "Test content 3", + f"[{', '.join([str(-1 if i % 2 else 1) for i in range(1024)])}]", ) yield await get_doc(developer_id=developer.id, doc_id=doc.id, connection_pool=pool) From 25a2e6560b1aba85431cad5cc5302a627e173c8c Mon Sep 17 00:00:00 2001 From: Vedantsahai18 Date: Mon, 13 Jan 2025 22:55:10 +0000 Subject: [PATCH 11/27] refactor: Lint agents-api (CI) --- agents-api/agents_api/common/nlp.py | 15 +-- .../queries/docs/search_docs_by_text.py | 1 - agents-api/tests/fixtures.py | 4 +- agents-api/tests/test_docs_queries.py | 96 ++++++++----------- 4 files changed, 49 insertions(+), 67 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 233517aa8..be86d8936 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -96,7 +96,8 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str] ent_spans = [ent for ent in doc.ents if ent.label_ not in excluded_labels] # Add more 
comprehensive stopword filtering for noun chunks chunk_spans = [ - chunk for chunk in doc.noun_chunks + chunk + for chunk in doc.noun_chunks if not chunk.root.is_stop and not all(token.is_stop for token in chunk) ] all_spans = filter_spans(ent_spans + chunk_spans) @@ -109,7 +110,7 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str] # Skip if all tokens in span are stopwords if all(token.is_stop for token in span): continue - + text = span.text.strip() lower_text = text.lower() @@ -194,7 +195,7 @@ def text_to_tsvector_query( ) -> str: """ Extracts meaningful keywords/phrases from text and joins them with OR. - + Example: Input: "I like basketball especially Michael Jordan" Output: "basketball OR Michael Jordan" @@ -216,7 +217,7 @@ def text_to_tsvector_query( for sent in doc.sents: sent_doc = sent.as_doc() - + # Extract keywords keywords = extract_keywords(sent_doc, top_n) if len(keywords) < min_keywords: @@ -235,7 +236,7 @@ def text_to_tsvector_query( if len(group) > 1: # Sort by length descending to prioritize longer phrases sorted_group = sorted(group, key=len, reverse=True) - # For truly proximate multi-word groups, group words + # For truly proximate multi-word groups, group words queries.add(" OR ".join(sorted_group)) else: # For non-proximate words or single words, add them separately @@ -265,7 +266,7 @@ def batch_text_to_tsvector_queries( results = [] for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): - queries = set() # Use set to avoid duplicates + queries = set() # Use set to avoid duplicates for sent in doc.sents: sent_doc = sent.as_doc() keywords = extract_keywords(sent_doc, top_n) @@ -280,7 +281,7 @@ def batch_text_to_tsvector_queries( if len(group) > 1: # Sort by length descending to prioritize longer phrases sorted_group = sorted(group, key=len, reverse=True) - # For truly proximate multi-word groups, group words + # For truly proximate multi-word groups, group words queries.add(" OR ".join(sorted_group)) else: # For non-proximate words or single words, add them separately diff --git a/agents-api/agents_api/queries/docs/search_docs_by_text.py b/agents-api/agents_api/queries/docs/search_docs_by_text.py index 44e1bb731..b1758625b 100644 --- a/agents-api/agents_api/queries/docs/search_docs_by_text.py +++ b/agents-api/agents_api/queries/docs/search_docs_by_text.py @@ -7,7 +7,6 @@ from ...autogen.openapi_model import DocReference from ...common.utils.db_exceptions import common_db_exceptions from ..utils import pg_query, rewrap_exceptions, wrap_in_class -from ...common.nlp import text_to_tsvector_query from .utils import transform_to_doc_reference # Raw query for text search diff --git a/agents-api/tests/fixtures.py b/agents-api/tests/fixtures.py index 43eb47b9a..a5dc7dc32 100644 --- a/agents-api/tests/fixtures.py +++ b/agents-api/tests/fixtures.py @@ -187,7 +187,7 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test f"[{', '.join([str(0.3 + 0.4 * (i % 3) / 2) for i in range(1024)])}]", ) - # Insert embedding with random values between -0.8 and 0.8 + # Insert embedding with random values between -0.8 and 0.8 await pool.execute( """ INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) @@ -195,7 +195,7 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test """, developer.id, doc.id, - "Test content 2", + "Test content 2", f"[{', '.join([str(-0.8 + 1.6 * (i % 5) / 4) for i in range(1024)])}]", ) diff --git 
a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index fd82d7396..54e182bd9 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -1,5 +1,6 @@ from agents_api.autogen.openapi_model import CreateDocRequest from agents_api.clients.pg import create_db_pool +from agents_api.common.nlp import text_to_tsvector_query from agents_api.queries.docs.create_doc import create_doc from agents_api.queries.docs.delete_doc import delete_doc from agents_api.queries.docs.get_doc import get_doc @@ -9,8 +10,6 @@ from agents_api.queries.docs.search_docs_hybrid import search_docs_hybrid from ward import test -from agents_api.common.nlp import text_to_tsvector_query - from .fixtures import ( pg_dsn, test_agent, @@ -24,34 +23,36 @@ import math + def make_vector_with_similarity(n: int, d: float): """ Returns a list `v` of length `n` such that the cosine similarity between `v` and the all-ones vector of length `n` is approximately d. """ if not -1.0 <= d <= 1.0: - raise ValueError("d must lie in [-1, 1].") - + msg = "d must lie in [-1, 1]." + raise ValueError(msg) + # Handle special cases exactly: if abs(d - 1.0) < 1e-12: # d ~ +1 return [1.0] * n if abs(d + 1.0) < 1e-12: # d ~ -1 return [-1.0] * n - if abs(d) < 1e-12: # d ~ 0 - v = [0.0]*n + if abs(d) < 1e-12: # d ~ 0 + v = [0.0] * n if n >= 2: v[0] = 1.0 v[1] = -1.0 return v sign_d = 1.0 if d >= 0 else -1.0 - + # Base part: sign(d)*[1,1,...,1] - base = [sign_d]*n - + base = [sign_d] * n + # Orthogonal unit vector u with sum(u)=0; for simplicity: # u = [1/sqrt(2), -1/sqrt(2), 0, 0, ..., 0] - u = [0.0]*n + u = [0.0] * n if n >= 2: u[0] = 1.0 / math.sqrt(2) u[1] = -1.0 / math.sqrt(2) @@ -59,13 +60,13 @@ def make_vector_with_similarity(n: int, d: float): # Solve for alpha: # alpha^2 = n*(1 - d^2)/d^2 - alpha = math.sqrt(n*(1 - d*d)) / abs(d) + alpha = math.sqrt(n * (1 - d * d)) / abs(d) # Construct v - v = [0.0]*n + v = [0.0] * n for i in range(n): v[i] = base[i] + alpha * u[i] - + return v @@ -304,6 +305,7 @@ async def _(dsn=pg_dsn, agent=test_agent, developer=test_developer): assert any(d.id == doc.id for d in result), f"Should find document {doc.id}" assert result[0].metadata == {"test": "test"}, "Metadata should match" + @test("query: search docs by text with technical terms and phrases") async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): pool = await create_db_pool(dsn=dsn) @@ -340,7 +342,7 @@ async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): "API endpoints", "REST architecture", "database optimization", - "indexing" + "indexing", ] for query in technical_queries: @@ -357,9 +359,14 @@ async def _(dsn=pg_dsn, developer=test_developer, agent=test_agent): # Verify appropriate document is found based on query if "API" in query or "REST" in query: - assert any(doc.id == doc1.id for doc in results), f"Doc1 should be found with query '{query}'" + assert any(doc.id == doc1.id for doc in results), ( + f"Doc1 should be found with query '{query}'" + ) if "database" in query.lower() or "indexing" in query: - assert any(doc.id == doc2.id for doc in results), f"Doc2 should be found with query '{query}'" + assert any(doc.id == doc2.id for doc in results), ( + f"Doc2 should be found with query '{query}'" + ) + @test("query: search docs by embedding") async def _( @@ -409,75 +416,49 @@ async def _( assert len(result) >= 1 assert result[0].metadata is not None + @test("utility: test text_to_tsvector_query") async def _(): test_cases = [ # Single words - ( - "test", - 
"test" - ), - + ("test", "test"), # Multiple words in single sentence ( "quick brown fox", - "quick brown fox" # Now kept as a single phrase due to proximity + "quick brown fox", # Now kept as a single phrase due to proximity ), - # Technical terms and phrases ( "Machine Learning algorithm", - "machine learning algorithm" # Common technical phrase + "machine learning algorithm", # Common technical phrase ), # Multiple sentences ( "Machine learning is great. Data science rocks.", - "machine learning OR data science rocks" + "machine learning OR data science rocks", ), - # Quoted phrases ( '"quick brown fox"', - "quick brown fox" # Quotes removed, phrase kept together + "quick brown fox", # Quotes removed, phrase kept together ), - ( - 'Find "machine learning" algorithms', - "machine learning" - ), - + ('Find "machine learning" algorithms', "machine learning"), # Multiple quoted phrases - ( - '"data science" and "machine learning"', - "machine learning OR data science" - ), - + ('"data science" and "machine learning"', "machine learning OR data science"), # Edge cases - ( - "", - "" - ), + ("", ""), ( "the and or", - "" # All stop words should result in empty string + "", # All stop words should result in empty string ), ( "a", - "" # Single stop word should result in empty string + "", # Single stop word should result in empty string ), - ( - "X", - "X" - ), - + ("X", "X"), # Empty quotes - ( - '""', - "" - ), - ( - 'test "" phrase', - "phrase OR test" - ), + ('""', ""), + ('test "" phrase', "phrase OR test"), ] for input_text, expected_output in test_cases: @@ -485,8 +466,9 @@ async def _(): result = text_to_tsvector_query(input_text) print(f"Generated query: '{result}'") print(f"Expected: '{expected_output}'\n") - assert result.lower() == expected_output.lower(), \ + assert result.lower() == expected_output.lower(), ( f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" + ) # @test("query: search docs by embedding with different confidence levels") From fd2481e4812996bc312943a799198147a45c0219 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Mon, 13 Jan 2025 18:03:26 -0500 Subject: [PATCH 12/27] chore: misc fix --- agents-api/tests/test_docs_queries.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index 54e182bd9..3abc91f2f 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -417,7 +417,7 @@ async def _( assert result[0].metadata is not None -@test("utility: test text_to_tsvector_query") +@test("utility: test for text_to_tsvector_query") async def _(): test_cases = [ # Single words @@ -434,8 +434,8 @@ async def _(): ), # Multiple sentences ( - "Machine learning is great. Data science rocks.", - "machine learning OR data science rocks", + "I love basketball especially Michael Jordan. 
LeBron James is also great.", + ["basketball OR lebron james OR michael jordan", "LeBron James OR Michael Jordan OR basketball"], ), # Quoted phrases ( @@ -466,9 +466,14 @@ async def _(): result = text_to_tsvector_query(input_text) print(f"Generated query: '{result}'") print(f"Expected: '{expected_output}'\n") - assert result.lower() == expected_output.lower(), ( - f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" - ) + if isinstance(expected_output, list): + assert any(result.lower() == expected_output.lower() for expected_output in expected_output), ( + f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" + ) + else: + assert result.lower() == expected_output.lower(), ( + f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" + ) # @test("query: search docs by embedding with different confidence levels") From 1a7eca2f872f96813cfb684cd1eea77c152c3af0 Mon Sep 17 00:00:00 2001 From: Vedantsahai18 Date: Mon, 13 Jan 2025 23:04:21 +0000 Subject: [PATCH 13/27] refactor: Lint agents-api (CI) --- agents-api/tests/test_docs_queries.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index 3abc91f2f..5326c7c4b 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -435,7 +435,10 @@ async def _(): # Multiple sentences ( "I love basketball especially Michael Jordan. LeBron James is also great.", - ["basketball OR lebron james OR michael jordan", "LeBron James OR Michael Jordan OR basketball"], + [ + "basketball OR lebron james OR michael jordan", + "LeBron James OR Michael Jordan OR basketball", + ], ), # Quoted phrases ( @@ -467,9 +470,9 @@ async def _(): print(f"Generated query: '{result}'") print(f"Expected: '{expected_output}'\n") if isinstance(expected_output, list): - assert any(result.lower() == expected_output.lower() for expected_output in expected_output), ( - f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" - ) + assert any( + result.lower() == expected_output.lower() for expected_output in expected_output + ), f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" else: assert result.lower() == expected_output.lower(), ( f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" From 7c9962813555b78ea6060dd699c0830d61fdb9a9 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Mon, 13 Jan 2025 18:31:29 -0500 Subject: [PATCH 14/27] chore: test fix + add embedding vector generation based on the confidence --- agents-api/tests/fixtures.py | 29 +++++++++++---- agents-api/tests/test_docs_queries.py | 51 ++------------------------- agents-api/tests/utils.py | 45 +++++++++++++++++++++++ 3 files changed, 71 insertions(+), 54 deletions(-) diff --git a/agents-api/tests/fixtures.py b/agents-api/tests/fixtures.py index a5dc7dc32..919403c1f 100644 --- a/agents-api/tests/fixtures.py +++ b/agents-api/tests/fixtures.py @@ -45,6 +45,7 @@ ) from .utils import ( patch_embed_acompletion as patch_embed_acompletion_ctx, + make_vector_with_similarity, ) @@ -164,6 +165,10 @@ async def test_doc(dsn=pg_dsn, developer=test_developer, agent=test_agent): @fixture(scope="test") async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test_doc): pool = await create_db_pool(dsn=dsn) + embedding_with_confidence_0 = make_vector_with_similarity(d=0.0) + embedding_with_confidence_05 = make_vector_with_similarity(d=0.5) + 
embedding_with_confidence_05_neg = make_vector_with_similarity(d=-0.5) + embedding_with_confidence_1_neg = make_vector_with_similarity(d=-1.0) await pool.execute( """ INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) @@ -175,7 +180,7 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test f"[{', '.join([str(x) for x in [1.0] * 1024])}]", ) - # Insert embedding with random values between 0.3 and 0.7 + # Insert embedding with confidence 0 with respect to unit vector await pool.execute( """ INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) @@ -184,10 +189,10 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test developer.id, doc.id, "Test content 1", - f"[{', '.join([str(0.3 + 0.4 * (i % 3) / 2) for i in range(1024)])}]", + f"[{', '.join([str(x) for x in embedding_with_confidence_0])}]", ) - # Insert embedding with random values between -0.8 and 0.8 + # Insert embedding with confidence 0.5 with respect to unit vector await pool.execute( """ INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) @@ -196,10 +201,10 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test developer.id, doc.id, "Test content 2", - f"[{', '.join([str(-0.8 + 1.6 * (i % 5) / 4) for i in range(1024)])}]", + f"[{', '.join([str(x) for x in embedding_with_confidence_05])}]", ) - # Insert embedding with alternating -1 and 1 + # Insert embedding with confidence -0.5 with respect to unit vector await pool.execute( """ INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) @@ -208,7 +213,19 @@ async def test_doc_with_embedding(dsn=pg_dsn, developer=test_developer, doc=test developer.id, doc.id, "Test content 3", - f"[{', '.join([str(-1 if i % 2 else 1) for i in range(1024)])}]", + f"[{', '.join([str(x) for x in embedding_with_confidence_05_neg])}]", + ) + + # Insert embedding with confidence -1 with respect to unit vector + await pool.execute( + """ + INSERT INTO docs_embeddings_store (developer_id, doc_id, index, chunk_seq, chunk, embedding) + VALUES ($1, $2, 0, 4, $3, $4) + """, + developer.id, + doc.id, + "Test content 4", + f"[{', '.join([str(x) for x in embedding_with_confidence_1_neg])}]", ) yield await get_doc(developer_id=developer.id, doc_id=doc.id, connection_pool=pool) diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index 5326c7c4b..fd0246e1c 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -19,55 +19,9 @@ test_user, ) -EMBEDDING_SIZE: int = 1024 - -import math - - -def make_vector_with_similarity(n: int, d: float): - """ - Returns a list `v` of length `n` such that the cosine similarity - between `v` and the all-ones vector of length `n` is approximately d. - """ - if not -1.0 <= d <= 1.0: - msg = "d must lie in [-1, 1]." 
- raise ValueError(msg) - - # Handle special cases exactly: - if abs(d - 1.0) < 1e-12: # d ~ +1 - return [1.0] * n - if abs(d + 1.0) < 1e-12: # d ~ -1 - return [-1.0] * n - if abs(d) < 1e-12: # d ~ 0 - v = [0.0] * n - if n >= 2: - v[0] = 1.0 - v[1] = -1.0 - return v - - sign_d = 1.0 if d >= 0 else -1.0 +from .utils import make_vector_with_similarity - # Base part: sign(d)*[1,1,...,1] - base = [sign_d] * n - - # Orthogonal unit vector u with sum(u)=0; for simplicity: - # u = [1/sqrt(2), -1/sqrt(2), 0, 0, ..., 0] - u = [0.0] * n - if n >= 2: - u[0] = 1.0 / math.sqrt(2) - u[1] = -1.0 / math.sqrt(2) - # (if n=1, there's no truly orthogonal vector to [1], so skip) - - # Solve for alpha: - # alpha^2 = n*(1 - d^2)/d^2 - alpha = math.sqrt(n * (1 - d * d)) / abs(d) - - # Construct v - v = [0.0] * n - for i in range(n): - v[i] = base[i] + alpha * u[i] - - return v +EMBEDDING_SIZE: int = 1024 @test("query: create user doc") @@ -438,6 +392,7 @@ async def _(): [ "basketball OR lebron james OR michael jordan", "LeBron James OR Michael Jordan OR basketball", + "Michael Jordan OR basketball OR LeBron James" ], ), # Quoted phrases diff --git a/agents-api/tests/utils.py b/agents-api/tests/utils.py index 05544e048..95f0194ed 100644 --- a/agents-api/tests/utils.py +++ b/agents-api/tests/utils.py @@ -1,6 +1,7 @@ import asyncio import logging import os +import math import subprocess from contextlib import asynccontextmanager, contextmanager from unittest.mock import patch @@ -18,6 +19,50 @@ # Replicated here to prevent circular import EMBEDDING_SIZE: int = 1024 +def make_vector_with_similarity(n: int = EMBEDDING_SIZE, d: float = 0.5): + """ + Returns a list `v` of length `n` such that the cosine similarity + between `v` and the all-ones vector of length `n` is approximately d. + """ + if not -1.0 <= d <= 1.0: + msg = "d must lie in [-1, 1]." 
+ raise ValueError(msg) + + # Handle special cases exactly: + if abs(d - 1.0) < 1e-12: # d ~ +1 + return [1.0] * n + if abs(d + 1.0) < 1e-12: # d ~ -1 + return [-1.0] * n + if abs(d) < 1e-12: # d ~ 0 + v = [0.0] * n + if n >= 2: + v[0] = 1.0 + v[1] = -1.0 + return v + + sign_d = 1.0 if d >= 0 else -1.0 + + # Base part: sign(d)*[1,1,...,1] + base = [sign_d] * n + + # Orthogonal unit vector u with sum(u)=0; for simplicity: + # u = [1/sqrt(2), -1/sqrt(2), 0, 0, ..., 0] + u = [0.0] * n + if n >= 2: + u[0] = 1.0 / math.sqrt(2) + u[1] = -1.0 / math.sqrt(2) + # (if n=1, there's no truly orthogonal vector to [1], so skip) + + # Solve for alpha: + # alpha^2 = n*(1 - d^2)/d^2 + alpha = math.sqrt(n * (1 - d * d)) / abs(d) + + # Construct v + v = [0.0] * n + for i in range(n): + v[i] = base[i] + alpha * u[i] + + return v @asynccontextmanager async def patch_testing_temporal(): From cb86135727d8c2f64060a296b4955c0a6ecf8ca1 Mon Sep 17 00:00:00 2001 From: Vedantsahai18 Date: Mon, 13 Jan 2025 23:32:20 +0000 Subject: [PATCH 15/27] refactor: Lint agents-api (CI) --- agents-api/tests/fixtures.py | 2 +- agents-api/tests/test_docs_queries.py | 4 +--- agents-api/tests/utils.py | 4 +++- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/agents-api/tests/fixtures.py b/agents-api/tests/fixtures.py index 919403c1f..5b0ff68cc 100644 --- a/agents-api/tests/fixtures.py +++ b/agents-api/tests/fixtures.py @@ -42,10 +42,10 @@ from .utils import ( get_localstack, get_pg_dsn, + make_vector_with_similarity, ) from .utils import ( patch_embed_acompletion as patch_embed_acompletion_ctx, - make_vector_with_similarity, ) diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index fd0246e1c..feec3a6c2 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -19,8 +19,6 @@ test_user, ) -from .utils import make_vector_with_similarity - EMBEDDING_SIZE: int = 1024 @@ -392,7 +390,7 @@ async def _(): [ "basketball OR lebron james OR michael jordan", "LeBron James OR Michael Jordan OR basketball", - "Michael Jordan OR basketball OR LeBron James" + "Michael Jordan OR basketball OR LeBron James", ], ), # Quoted phrases diff --git a/agents-api/tests/utils.py b/agents-api/tests/utils.py index 95f0194ed..45489befd 100644 --- a/agents-api/tests/utils.py +++ b/agents-api/tests/utils.py @@ -1,7 +1,7 @@ import asyncio import logging -import os import math +import os import subprocess from contextlib import asynccontextmanager, contextmanager from unittest.mock import patch @@ -19,6 +19,7 @@ # Replicated here to prevent circular import EMBEDDING_SIZE: int = 1024 + def make_vector_with_similarity(n: int = EMBEDDING_SIZE, d: float = 0.5): """ Returns a list `v` of length `n` such that the cosine similarity @@ -64,6 +65,7 @@ def make_vector_with_similarity(n: int = EMBEDDING_SIZE, d: float = 0.5): return v + @asynccontextmanager async def patch_testing_temporal(): # Set log level to ERROR to avoid spamming the console From 61d32bd1b68add043640e77db54b497f920014f4 Mon Sep 17 00:00:00 2001 From: Ahmad Haidar Date: Tue, 14 Jan 2025 15:27:13 +0300 Subject: [PATCH 16/27] fix(agents-api): Configure spacy for postgresql --- agents-api/agents_api/common/nlp.py | 255 ++++++++------------------ agents-api/tests/test_docs_queries.py | 20 +- 2 files changed, 86 insertions(+), 189 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index be86d8936..62895f7f9 100644 --- a/agents-api/agents_api/common/nlp.py +++ 
b/agents-api/agents_api/common/nlp.py @@ -29,67 +29,33 @@ }, ) - -# Singleton PhraseMatcher for better performance -class KeywordMatcher: - _instance = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super().__new__(cls) - cls._instance.matcher = PhraseMatcher(nlp.vocab, attr="LOWER") - cls._instance.batch_size = 1000 # Adjust based on memory constraints - cls._instance.patterns_cache = {} - return cls._instance - - @lru_cache(maxsize=10000) - def _create_pattern(self, text: str) -> Doc: - return nlp.make_doc(text) - - def find_matches(self, doc: Doc, keywords: list[str]) -> dict[str, list[int]]: - """Batch process keywords for better performance.""" - keyword_positions = defaultdict(list) - - # Process keywords in batches to avoid memory issues - for i in range(0, len(keywords), self.batch_size): - batch = keywords[i : i + self.batch_size] - patterns = [self._create_pattern(kw) for kw in batch] - - # Clear previous patterns and add new batch - if "KEYWORDS" in self.matcher: - self.matcher.remove("KEYWORDS") - self.matcher.add("KEYWORDS", patterns) - - # Find matches for this batch - matches = self.matcher(doc) - for match_id, start, end in matches: - span_text = doc[start:end].text - normalized = WHITESPACE_RE.sub(" ", span_text).lower().strip() - keyword_positions[normalized].append(start) - - return keyword_positions - - -# Initialize global matcher -keyword_matcher = KeywordMatcher() - - @lru_cache(maxsize=10000) def clean_keyword(kw: str) -> str: """Cache cleaned keywords for reuse.""" return NON_ALPHANUM_RE.sub("", kw).strip() -def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str]: +def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { - "DATE", - "TIME", - "PERCENT", - "MONEY", - "QUANTITY", - "ORDINAL", - "CARDINAL", + "DATE", # Absolute or relative dates or periods. + "TIME", # Times smaller than a day. + "PERCENT", # Percentage, including ”%“. + "MONEY", # Monetary values, including unit. + "QUANTITY", # Measurements, as of weight or distance. + "ORDINAL", # “first”, “second”, etc. + "CARDINAL", # Numerals that do not fall under another type. + # "PERSON", # People, including fictional. + # "NORP", # Nationalities or religious or political groups. + # "FAC", # Buildings, airports, highways, bridges, etc. + # "ORG", # Companies, agencies, institutions, etc. + # "GPE", # Countries, cities, states. + # "LOC", # Non-GPE locations, mountain ranges, bodies of water. + # "PRODUCT", # Objects, vehicles, foods, etc. (Not services.) + # "EVENT", # Named hurricanes, battles, wars, sports events, etc. + # "WORK_OF_ART", # Titles of books, songs, etc. + # "LAW", # Named documents made into laws. + # "LANGUAGE", # Any named language. 
} # Extract and filter spans in a single pass @@ -104,8 +70,12 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str] # Process spans efficiently and filter out spans that are entirely stopwords keywords = [] + ent_keywords = [] seen_texts = set() + # Convert ent_spans to set for faster lookup + ent_spans_set = set(ent_spans) + for span in all_spans: # Skip if all tokens in span are stopwords if all(token.is_stop for token in span): @@ -119,79 +89,30 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str] continue seen_texts.add(lower_text) - keywords.append(text) + ent_keywords.append(text) if span in ent_spans_set else keywords.append(text) + # Normalize keywords by replacing multiple spaces with single space and stripping + normalized_ent_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in ent_keywords] normalized_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in keywords] # Count frequencies efficiently + ent_freq = Counter(normalized_ent_keywords) freq = Counter(normalized_keywords) - top_keywords = [kw for kw, _ in freq.most_common(top_n)] + + + top_keywords = [kw for kw, _ in ent_freq.most_common(top_n)] + remaining_slots = max(0, top_n - len(top_keywords)) + top_keywords += [kw for kw, _ in freq.most_common(remaining_slots)] if clean: return [clean_keyword(kw) for kw in top_keywords] return top_keywords -def find_proximity_groups( - keywords: list[str], keyword_positions: dict[str, list[int]], n: int = 10 -) -> list[set[str]]: - """Optimized proximity grouping using sorted positions.""" - # Early return for single or no keywords - if len(keywords) <= 1: - return [{kw} for kw in keywords] - - # Create flat list of positions for efficient processing - positions: list[tuple[int, str]] = [ - (pos, kw) for kw in keywords for pos in keyword_positions[kw] - ] - - # Sort positions once - positions.sort() - - # Initialize Union-Find with path compression and union by rank - parent = {kw: kw for kw in keywords} - rank = dict.fromkeys(keywords, 0) - - def find(u: str) -> str: - if parent[u] != u: - parent[u] = find(parent[u]) - return parent[u] - - def union(u: str, v: str) -> None: - u_root, v_root = find(u), find(v) - if u_root != v_root: - if rank[u_root] < rank[v_root]: - u_root, v_root = v_root, u_root - parent[v_root] = u_root - if rank[u_root] == rank[v_root]: - rank[u_root] += 1 - - # Use sliding window for proximity checking - window = [] - for pos, kw in positions: - # Remove positions outside window - while window and pos - window[0][0] > n: - window.pop(0) - - # Union with all keywords in window - for _, w_kw in window: - union(kw, w_kw) - - window.append((pos, kw)) - - # Group keywords efficiently - groups = defaultdict(set) - for kw in keywords: - root = find(kw) - groups[root].add(kw) - - return list(groups.values()) - - @lru_cache(maxsize=1000) def text_to_tsvector_query( - paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1 + paragraph: str, top_n: int = 25, min_keywords: int = 1 ) -> str: """ Extracts meaningful keywords/phrases from text and joins them with OR. 
@@ -203,7 +124,6 @@ def text_to_tsvector_query( Args: paragraph (str): The input text to process top_n (int): Number of top keywords to extract per sentence - proximity_n (int): The proximity window for grouping related keywords min_keywords (int): Minimum number of keywords required Returns: @@ -223,71 +143,54 @@ def text_to_tsvector_query( if len(keywords) < min_keywords: continue - # Find keyword positions - keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) - if not keyword_positions: - continue - - # Group related keywords by proximity - groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - - # Add each group as a single term to our set - for group in groups: - if len(group) > 1: - # Sort by length descending to prioritize longer phrases - sorted_group = sorted(group, key=len, reverse=True) - # For truly proximate multi-word groups, group words - queries.add(" OR ".join(sorted_group)) - else: - # For non-proximate words or single words, add them separately - queries.update(group) + queries.add(" OR ".join(keywords)) # Join all terms with " OR " return " OR ".join(queries) if queries else "" -def batch_text_to_tsvector_queries( - paragraphs: list[str], - top_n: int = 10, - proximity_n: int = 10, - min_keywords: int = 1, - n_process: int = 1, -) -> list[str]: - """ - Processes multiple paragraphs using nlp.pipe for better performance. - - Args: - paragraphs (list[str]): List of paragraphs to process - top_n (int): Number of top keywords to include per paragraph - - Returns: - list[str]: List of tsquery strings - """ - results = [] - - for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): - queries = set() # Use set to avoid duplicates - for sent in doc.sents: - sent_doc = sent.as_doc() - keywords = extract_keywords(sent_doc, top_n) - if len(keywords) < min_keywords: - continue - keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) - if not keyword_positions: - continue - groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - # Add each group as a single term to our set - for group in groups: - if len(group) > 1: - # Sort by length descending to prioritize longer phrases - sorted_group = sorted(group, key=len, reverse=True) - # For truly proximate multi-word groups, group words - queries.add(" OR ".join(sorted_group)) - else: - # For non-proximate words or single words, add them separately - queries.update(group) - - # Join all terms with " OR " - results.append(" OR ".join(queries) if queries else "") - - return results +# def batch_text_to_tsvector_queries( +# paragraphs: list[str], +# top_n: int = 10, +# proximity_n: int = 10, +# min_keywords: int = 1, +# n_process: int = 1, +# ) -> list[str]: +# """ +# Processes multiple paragraphs using nlp.pipe for better performance. 
+ +# Args: +# paragraphs (list[str]): List of paragraphs to process +# top_n (int): Number of top keywords to include per paragraph + +# Returns: +# list[str]: List of tsquery strings +# """ +# results = [] + +# for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): +# queries = set() # Use set to avoid duplicates +# for sent in doc.sents: +# sent_doc = sent.as_doc() +# keywords = extract_keywords(sent_doc, top_n) +# if len(keywords) < min_keywords: +# continue +# keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) +# if not keyword_positions: +# continue +# groups = find_proximity_groups(keywords, keyword_positions, proximity_n) +# # Add each group as a single term to our set +# for group in groups: +# if len(group) > 1: +# # Sort by length descending to prioritize longer phrases +# sorted_group = sorted(group, key=len, reverse=True) +# # For truly proximate multi-word groups, group words +# queries.add(" OR ".join(sorted_group)) +# else: +# # For non-proximate words or single words, add them separately +# queries.update(group) + +# # Join all terms with " OR " +# results.append(" OR ".join(queries) if queries else "") + +# return results diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index feec3a6c2..fea7f4fbf 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -387,11 +387,7 @@ async def _(): # Multiple sentences ( "I love basketball especially Michael Jordan. LeBron James is also great.", - [ - "basketball OR lebron james OR michael jordan", - "LeBron James OR Michael Jordan OR basketball", - "Michael Jordan OR basketball OR LeBron James", - ], + "basketball OR lebron james OR michael jordan", ), # Quoted phrases ( @@ -422,14 +418,12 @@ async def _(): result = text_to_tsvector_query(input_text) print(f"Generated query: '{result}'") print(f"Expected: '{expected_output}'\n") - if isinstance(expected_output, list): - assert any( - result.lower() == expected_output.lower() for expected_output in expected_output - ), f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" - else: - assert result.lower() == expected_output.lower(), ( - f"Expected '{expected_output}' but got '{result}' for input '{input_text}'" - ) + + result_terms = set(term.lower() for term in result.split(" OR ") if term) + expected_terms = set(term.lower() for term in expected_output.split(" OR ") if term) + assert result_terms == expected_terms, ( + f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" + ) # @test("query: search docs by embedding with different confidence levels") From 890880bb67b776c424624e910ca49c0a045ad45f Mon Sep 17 00:00:00 2001 From: Ahmad-mtos Date: Tue, 14 Jan 2025 12:28:09 +0000 Subject: [PATCH 17/27] refactor: Lint agents-api (CI) --- agents-api/agents_api/common/nlp.py | 24 ++++++++++-------------- agents-api/tests/test_docs_queries.py | 6 +++--- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 62895f7f9..26cba72ce 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -1,9 +1,8 @@ import re -from collections import Counter, defaultdict +from collections import Counter from functools import lru_cache import spacy -from spacy.matcher import PhraseMatcher from spacy.tokens import Doc from spacy.util import filter_spans @@ -29,6 +28,7 @@ }, ) + @lru_cache(maxsize=10000) def 
clean_keyword(kw: str) -> str: """Cache cleaned keywords for reuse.""" @@ -38,13 +38,13 @@ def clean_keyword(kw: str) -> str: def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { - "DATE", # Absolute or relative dates or periods. - "TIME", # Times smaller than a day. - "PERCENT", # Percentage, including ”%“. - "MONEY", # Monetary values, including unit. - "QUANTITY", # Measurements, as of weight or distance. - "ORDINAL", # “first”, “second”, etc. - "CARDINAL", # Numerals that do not fall under another type. + "DATE", # Absolute or relative dates or periods. + "TIME", # Times smaller than a day. + "PERCENT", # Percentage, including ”%“. + "MONEY", # Monetary values, including unit. + "QUANTITY", # Measurements, as of weight or distance. + "ORDINAL", # “first”, “second”, etc. + "CARDINAL", # Numerals that do not fall under another type. # "PERSON", # People, including fictional. # "NORP", # Nationalities or religious or political groups. # "FAC", # Buildings, airports, highways, bridges, etc. @@ -91,7 +91,6 @@ def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str] seen_texts.add(lower_text) ent_keywords.append(text) if span in ent_spans_set else keywords.append(text) - # Normalize keywords by replacing multiple spaces with single space and stripping normalized_ent_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in ent_keywords] normalized_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in keywords] @@ -100,7 +99,6 @@ def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str] ent_freq = Counter(normalized_ent_keywords) freq = Counter(normalized_keywords) - top_keywords = [kw for kw, _ in ent_freq.most_common(top_n)] remaining_slots = max(0, top_n - len(top_keywords)) top_keywords += [kw for kw, _ in freq.most_common(remaining_slots)] @@ -111,9 +109,7 @@ def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str] @lru_cache(maxsize=1000) -def text_to_tsvector_query( - paragraph: str, top_n: int = 25, min_keywords: int = 1 -) -> str: +def text_to_tsvector_query(paragraph: str, top_n: int = 25, min_keywords: int = 1) -> str: """ Extracts meaningful keywords/phrases from text and joins them with OR. 
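With the proximity grouping gone, the cached text_to_tsvector_query now emits a flat "kw1 OR kw2 OR ..." string, which later patches in this series feed into the docs search queries. As a rough sketch of how such a string can be consumed on the PostgreSQL side, assuming an asyncpg connection and a websearch_to_tsquery-based query (the table and column names are placeholders, and the project's actual search_docs_text_query SQL may differ):

    import asyncpg  # assumed: the project talks to Postgres through an asyncpg-style client

    async def run_text_search(dsn: str, tsquery_text: str) -> list[asyncpg.Record]:
        # websearch_to_tsquery ANDs unquoted words, turns the word OR into |,
        # and turns quoted phrases into <-> chains, so an OR-joined keyword
        # string such as "google OR john doe" parses without errors.
        conn = await asyncpg.connect(dsn)
        try:
            return await conn.fetch(
                """
                SELECT doc_id, title
                FROM docs
                WHERE search_tsv @@ websearch_to_tsquery('english', $1)
                ORDER BY ts_rank(search_tsv, websearch_to_tsquery('english', $1)) DESC
                LIMIT 10
                """,
                tsquery_text,
            )
        finally:
            await conn.close()

    # usage (inside an event loop):
    # rows = await run_text_search(dsn, text_to_tsvector_query("John Doe at Google"))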
diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index fea7f4fbf..d4d685c1d 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -418,9 +418,9 @@ async def _(): result = text_to_tsvector_query(input_text) print(f"Generated query: '{result}'") print(f"Expected: '{expected_output}'\n") - - result_terms = set(term.lower() for term in result.split(" OR ") if term) - expected_terms = set(term.lower() for term in expected_output.split(" OR ") if term) + + result_terms = {term.lower() for term in result.split(" OR ") if term} + expected_terms = {term.lower() for term in expected_output.split(" OR ") if term} assert result_terms == expected_terms, ( f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" ) From 27ed1f4a2944f7afb5e5a1ab483693973187297e Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Tue, 14 Jan 2025 17:38:59 -0500 Subject: [PATCH 18/27] chore: misc refactor --- agents-api/agents_api/common/nlp.py | 30 ++++++----------------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index be86d8936..6850be6a9 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -263,31 +263,13 @@ def batch_text_to_tsvector_queries( Returns: list[str]: List of tsquery strings """ - results = [] + # Use a set to avoid duplicates + results = set() for doc in nlp.pipe(paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process): - queries = set() # Use set to avoid duplicates - for sent in doc.sents: - sent_doc = sent.as_doc() - keywords = extract_keywords(sent_doc, top_n) - if len(keywords) < min_keywords: - continue - keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) - if not keyword_positions: - continue - groups = find_proximity_groups(keywords, keyword_positions, proximity_n) - # Add each group as a single term to our set - for group in groups: - if len(group) > 1: - # Sort by length descending to prioritize longer phrases - sorted_group = sorted(group, key=len, reverse=True) - # For truly proximate multi-word groups, group words - queries.add(" OR ".join(sorted_group)) - else: - # For non-proximate words or single words, add them separately - queries.update(group) - - # Join all terms with " OR " - results.append(" OR ".join(queries) if queries else "") + # Generate tsquery string for each paragraph + queries = text_to_tsvector_query(doc, top_n, proximity_n, min_keywords) + # Add to results set + results.add(queries) return results From 6a07a54b47b597e922bee597932f6ec2f006790e Mon Sep 17 00:00:00 2001 From: Vedant Sahai Date: Tue, 14 Jan 2025 17:40:03 -0500 Subject: [PATCH 19/27] Update agents-api/agents_api/common/nlp.py Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> --- agents-api/agents_api/common/nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 6850be6a9..e01af83b6 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -272,4 +272,4 @@ def batch_text_to_tsvector_queries( # Add to results set results.add(queries) - return results + return list(results) From 68a7a05b4557e8627d5ab6cdce94f09d085924cc Mon Sep 17 00:00:00 2001 From: Ahmad Haidar Date: Wed, 15 Jan 2025 11:40:32 +0300 Subject: [PATCH 20/27] fix(agents-api): add split chunks option + nlp 
tests --- agents-api/agents_api/common/nlp.py | 23 +++- agents-api/tests/test_docs_queries.py | 57 ---------- agents-api/tests/test_nlp_utilities.py | 147 +++++++++++++++++++++++++ 3 files changed, 165 insertions(+), 62 deletions(-) create mode 100644 agents-api/tests/test_nlp_utilities.py diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 26cba72ce..c49928508 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -9,6 +9,7 @@ # Precompile regex patterns WHITESPACE_RE = re.compile(r"\s+") NON_ALPHANUM_RE = re.compile(r"[^\w\s\-_]+") +LONE_HYPHEN_RE = re.compile(r'\s*-\s*(?!\w)|(? str: """Cache cleaned keywords for reuse.""" - return NON_ALPHANUM_RE.sub("", kw).strip() + # First remove non-alphanumeric chars (except whitespace, hyphens, underscores) + cleaned = NON_ALPHANUM_RE.sub("", kw).strip() + # Replace lone hyphens with spaces + cleaned = LONE_HYPHEN_RE.sub(" ", cleaned) + # Clean up any resulting multiple spaces + cleaned = WHITESPACE_RE.sub(" ", cleaned).strip() + return cleaned -def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str]: +def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True, split_chunks: bool = False) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { "DATE", # Absolute or relative dates or periods. @@ -95,6 +102,9 @@ def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str] normalized_ent_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in ent_keywords] normalized_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in keywords] + if split_chunks: + normalized_keywords = [word for kw in normalized_keywords for word in kw.split()] + # Count frequencies efficiently ent_freq = Counter(normalized_ent_keywords) freq = Counter(normalized_keywords) @@ -109,7 +119,9 @@ def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True) -> list[str] @lru_cache(maxsize=1000) -def text_to_tsvector_query(paragraph: str, top_n: int = 25, min_keywords: int = 1) -> str: +def text_to_tsvector_query( + paragraph: str, top_n: int = 25, min_keywords: int = 1, split_chunks: bool = False +) -> str: """ Extracts meaningful keywords/phrases from text and joins them with OR. 
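This hunk introduces LONE_HYPHEN_RE and reworks clean_keyword into a three-step pipeline: strip punctuation (keeping whitespace, hyphens and underscores), blank out lone hyphens, then collapse the resulting whitespace. The lone-hyphen pattern above is truncated, so the sketch below substitutes one plausible equivalent; treat the regex and the helper name as illustrations rather than the exact code in the patch:

    import re

    WHITESPACE_RE = re.compile(r"\s+")
    NON_ALPHANUM_RE = re.compile(r"[^\w\s\-_]+")
    # Plausible stand-in for LONE_HYPHEN_RE: a hyphen that has no word
    # character directly on at least one side.
    LONE_HYPHEN_RE = re.compile(r"\s*-\s*(?!\w)|(?<!\w)\s*-\s*")

    def clean_keyword_sketch(kw: str) -> str:
        # Drop punctuation except whitespace, hyphens and underscores
        cleaned = NON_ALPHANUM_RE.sub("", kw).strip()
        # Replace lone hyphens ("- try") with spaces, keep hyphenated words ("user-name")
        cleaned = LONE_HYPHEN_RE.sub(" ", cleaned)
        # Collapse any runs of whitespace left behind
        return WHITESPACE_RE.sub(" ", cleaned).strip()

    # clean_keyword_sketch("- google")      -> "google"
    # clean_keyword_sketch("user-name_123") -> "user-name_123"
    # clean_keyword_sketch("$price: 100%")  -> "price 100"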
@@ -121,6 +133,7 @@ def text_to_tsvector_query(paragraph: str, top_n: int = 25, min_keywords: int = paragraph (str): The input text to process top_n (int): Number of top keywords to extract per sentence min_keywords (int): Minimum number of keywords required + split_chunks (bool): If True, breaks multi-word noun chunks into individual words Returns: str: Keywords/phrases joined by OR @@ -135,11 +148,11 @@ def text_to_tsvector_query(paragraph: str, top_n: int = 25, min_keywords: int = sent_doc = sent.as_doc() # Extract keywords - keywords = extract_keywords(sent_doc, top_n) + keywords = extract_keywords(sent_doc, top_n, split_chunks=split_chunks) if len(keywords) < min_keywords: continue - queries.add(" OR ".join(keywords)) + queries.update(keywords) # Join all terms with " OR " return " OR ".join(queries) if queries else "" diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index d4d685c1d..82147a398 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -369,63 +369,6 @@ async def _( assert result[0].metadata is not None -@test("utility: test for text_to_tsvector_query") -async def _(): - test_cases = [ - # Single words - ("test", "test"), - # Multiple words in single sentence - ( - "quick brown fox", - "quick brown fox", # Now kept as a single phrase due to proximity - ), - # Technical terms and phrases - ( - "Machine Learning algorithm", - "machine learning algorithm", # Common technical phrase - ), - # Multiple sentences - ( - "I love basketball especially Michael Jordan. LeBron James is also great.", - "basketball OR lebron james OR michael jordan", - ), - # Quoted phrases - ( - '"quick brown fox"', - "quick brown fox", # Quotes removed, phrase kept together - ), - ('Find "machine learning" algorithms', "machine learning"), - # Multiple quoted phrases - ('"data science" and "machine learning"', "machine learning OR data science"), - # Edge cases - ("", ""), - ( - "the and or", - "", # All stop words should result in empty string - ), - ( - "a", - "", # Single stop word should result in empty string - ), - ("X", "X"), - # Empty quotes - ('""', ""), - ('test "" phrase', "phrase OR test"), - ] - - for input_text, expected_output in test_cases: - print(f"Input: '{input_text}'") - result = text_to_tsvector_query(input_text) - print(f"Generated query: '{result}'") - print(f"Expected: '{expected_output}'\n") - - result_terms = {term.lower() for term in result.split(" OR ") if term} - expected_terms = {term.lower() for term in expected_output.split(" OR ") if term} - assert result_terms == expected_terms, ( - f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" - ) - - # @test("query: search docs by embedding with different confidence levels") # async def _( # dsn=pg_dsn, agent=test_agent, developer=test_developer, doc=test_doc_with_embedding diff --git a/agents-api/tests/test_nlp_utilities.py b/agents-api/tests/test_nlp_utilities.py new file mode 100644 index 000000000..63d7f126c --- /dev/null +++ b/agents-api/tests/test_nlp_utilities.py @@ -0,0 +1,147 @@ +from agents_api.common.nlp import text_to_tsvector_query, clean_keyword, extract_keywords +import spacy + +from ward import test + +@test("utility: clean_keyword") +async def _(): + assert clean_keyword("Hello, World!") == "Hello World" + + # Basic cleaning + # assert clean_keyword("test@example.com") == "test example com" + assert clean_keyword("user-name_123") == "user-name_123" + assert clean_keyword(" spaces ") == "spaces" + 
+ # Special characters + assert clean_keyword("$price: 100%") == "price 100" + assert clean_keyword("#hashtag!") == "hashtag" + + # Multiple spaces and punctuation + assert clean_keyword("multiple, spaces...") == "multiple spaces" + + # Empty and whitespace + assert clean_keyword("") == "" + assert clean_keyword(" ") == "" + + assert clean_keyword("- try") == "try" + +@test("utility: extract_keywords") +async def _(): + nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "textcat"]) + doc = nlp("John Doe is a software engineer at Google.") + assert set(extract_keywords(doc)) == {"John Doe", "a software engineer", "Google"} + +@test("utility: text_to_tsvector_query - split_chunks=False") +async def _(): + test_cases = [ + # Single words + ("test", "test"), + # Multiple words in single sentence + ( + "quick brown fox", + "quick brown fox", # Now kept as a single phrase due to proximity + ), + # Technical terms and phrases + ( + "Machine Learning algorithm", + "machine learning algorithm", # Common technical phrase + ), + # Multiple sentences + ( + "I love basketball especially Michael Jordan. LeBron James is also great.", + "basketball OR lebron james OR michael jordan", + ), + # Quoted phrases + ( + '"quick brown fox"', + "quick brown fox", # Quotes removed, phrase kept together + ), + ('Find "machine learning" algorithms', "machine learning"), + # Multiple quoted phrases + ('"data science" and "machine learning"', "machine learning OR data science"), + # Edge cases + ("", ""), + ( + "the and or", + "", # All stop words should result in empty string + ), + ( + "a", + "", # Single stop word should result in empty string + ), + ("X", "X"), + # Empty quotes + ('""', ""), + ('test "" phrase', "phrase OR test"), + ("John Doe is a software engineer at Google.", "google OR john doe OR a software engineer"), + ("- google", "google"), + ] + + for input_text, expected_output in test_cases: + print(f"Input: '{input_text}'") + result = text_to_tsvector_query(input_text, split_chunks=False) + print(f"Generated query: '{result}'") + print(f"Expected: '{expected_output}'\n") + + result_terms = set(term.lower() for term in result.split(" OR ") if term) + expected_terms = set(term.lower() for term in expected_output.split(" OR ") if term) + assert result_terms == expected_terms, ( + f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" + ) + +@test("utility: text_to_tsvector_query - split_chunks=True") +async def _(): + test_cases = [ + # Single words + ("test", "test"), + # Multiple words in single sentence + ( + "quick brown fox", + "quick OR brown OR fox", # Now kept as a single phrase due to proximity + ), + # Technical terms and phrases + ( + "Machine Learning algorithm", + "machine OR learning OR algorithm", # Common technical phrase + ), + # Multiple sentences + ( + "I love basketball especially Michael Jordan. 
LeBron James is also great.", + "basketball OR lebron james OR michael jordan", + ), + # Quoted phrases + ( + '"quick brown fox"', + "quick OR brown OR fox", # Quotes removed, phrase kept together + ), + ('Find "machine learning" algorithms', "machine OR learning"), + # Multiple quoted phrases + ('"data science" and "machine learning"', "machine OR learning OR data OR science"), + # Edge cases + ("", ""), + ( + "the and or", + "", # All stop words should result in empty string + ), + ( + "a", + "", # Single stop word should result in empty string + ), + ("X", "X"), + # Empty quotes + ('""', ""), + ('test "" phrase', "phrase OR test"), + ("John Doe is a software engineer at Google.", "google OR john doe OR a OR software OR engineer"), + ] + + for input_text, expected_output in test_cases: + print(f"Input: '{input_text}'") + result = text_to_tsvector_query(input_text, split_chunks=True) + print(f"Generated query: '{result}'") + print(f"Expected: '{expected_output}'\n") + + result_terms = set(term.lower() for term in result.split(" OR ") if term) + expected_terms = set(term.lower() for term in expected_output.split(" OR ") if term) + assert result_terms == expected_terms, ( + f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" + ) \ No newline at end of file From 3fa200c74f85f35c66d8019b5b4daf7eca397f17 Mon Sep 17 00:00:00 2001 From: Ahmad-mtos Date: Wed, 15 Jan 2025 08:41:26 +0000 Subject: [PATCH 21/27] refactor: Lint agents-api (CI) --- agents-api/agents_api/common/nlp.py | 9 +++--- agents-api/tests/test_docs_queries.py | 1 - agents-api/tests/test_nlp_utilities.py | 39 ++++++++++++++++---------- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index c49928508..6c5f49f74 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -9,7 +9,7 @@ # Precompile regex patterns WHITESPACE_RE = re.compile(r"\s+") NON_ALPHANUM_RE = re.compile(r"[^\w\s\-_]+") -LONE_HYPHEN_RE = re.compile(r'\s*-\s*(?!\w)|(? str: # Replace lone hyphens with spaces cleaned = LONE_HYPHEN_RE.sub(" ", cleaned) # Clean up any resulting multiple spaces - cleaned = WHITESPACE_RE.sub(" ", cleaned).strip() - return cleaned + return WHITESPACE_RE.sub(" ", cleaned).strip() -def extract_keywords(doc: Doc, top_n: int = 25, clean: bool = True, split_chunks: bool = False) -> list[str]: +def extract_keywords( + doc: Doc, top_n: int = 25, clean: bool = True, split_chunks: bool = False +) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { "DATE", # Absolute or relative dates or periods. 
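The new split_chunks flag is what turns "Machine Learning algorithm" into "machine OR learning OR algorithm" in the tests above: after whitespace normalization, multi-word noun chunks are exploded into individual words before frequency counting, while named-entity keywords stay intact, which is why "john doe" survives splitting. A standalone sketch of just that normalization step (the helper name is illustrative):

    import re

    WHITESPACE_RE = re.compile(r"\s+")

    def normalize_keywords(keywords: list[str], split_chunks: bool) -> list[str]:
        # Collapse internal whitespace, as extract_keywords does
        normalized = [WHITESPACE_RE.sub(" ", kw).strip() for kw in keywords]
        if split_chunks:
            # Break multi-word chunks into individual words
            normalized = [word for kw in normalized for word in kw.split()]
        return normalized

    # normalize_keywords(["Machine Learning algorithm"], split_chunks=False)
    #   -> ["Machine Learning algorithm"]
    # normalize_keywords(["Machine Learning algorithm"], split_chunks=True)
    #   -> ["Machine", "Learning", "algorithm"]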
diff --git a/agents-api/tests/test_docs_queries.py b/agents-api/tests/test_docs_queries.py index 82147a398..7782b3bf7 100644 --- a/agents-api/tests/test_docs_queries.py +++ b/agents-api/tests/test_docs_queries.py @@ -1,6 +1,5 @@ from agents_api.autogen.openapi_model import CreateDocRequest from agents_api.clients.pg import create_db_pool -from agents_api.common.nlp import text_to_tsvector_query from agents_api.queries.docs.create_doc import create_doc from agents_api.queries.docs.delete_doc import delete_doc from agents_api.queries.docs.get_doc import get_doc diff --git a/agents-api/tests/test_nlp_utilities.py b/agents-api/tests/test_nlp_utilities.py index 63d7f126c..82c22911a 100644 --- a/agents-api/tests/test_nlp_utilities.py +++ b/agents-api/tests/test_nlp_utilities.py @@ -1,36 +1,38 @@ -from agents_api.common.nlp import text_to_tsvector_query, clean_keyword, extract_keywords import spacy - +from agents_api.common.nlp import clean_keyword, extract_keywords, text_to_tsvector_query from ward import test + @test("utility: clean_keyword") async def _(): assert clean_keyword("Hello, World!") == "Hello World" - + # Basic cleaning # assert clean_keyword("test@example.com") == "test example com" assert clean_keyword("user-name_123") == "user-name_123" assert clean_keyword(" spaces ") == "spaces" - + # Special characters assert clean_keyword("$price: 100%") == "price 100" assert clean_keyword("#hashtag!") == "hashtag" - + # Multiple spaces and punctuation assert clean_keyword("multiple, spaces...") == "multiple spaces" - + # Empty and whitespace assert clean_keyword("") == "" assert clean_keyword(" ") == "" assert clean_keyword("- try") == "try" + @test("utility: extract_keywords") async def _(): nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "textcat"]) doc = nlp("John Doe is a software engineer at Google.") assert set(extract_keywords(doc)) == {"John Doe", "a software engineer", "Google"} + @test("utility: text_to_tsvector_query - split_chunks=False") async def _(): test_cases = [ @@ -73,7 +75,10 @@ async def _(): # Empty quotes ('""', ""), ('test "" phrase', "phrase OR test"), - ("John Doe is a software engineer at Google.", "google OR john doe OR a software engineer"), + ( + "John Doe is a software engineer at Google.", + "google OR john doe OR a software engineer", + ), ("- google", "google"), ] @@ -82,13 +87,14 @@ async def _(): result = text_to_tsvector_query(input_text, split_chunks=False) print(f"Generated query: '{result}'") print(f"Expected: '{expected_output}'\n") - - result_terms = set(term.lower() for term in result.split(" OR ") if term) - expected_terms = set(term.lower() for term in expected_output.split(" OR ") if term) + + result_terms = {term.lower() for term in result.split(" OR ") if term} + expected_terms = {term.lower() for term in expected_output.split(" OR ") if term} assert result_terms == expected_terms, ( f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" ) + @test("utility: text_to_tsvector_query - split_chunks=True") async def _(): test_cases = [ @@ -131,7 +137,10 @@ async def _(): # Empty quotes ('""', ""), ('test "" phrase', "phrase OR test"), - ("John Doe is a software engineer at Google.", "google OR john doe OR a OR software OR engineer"), + ( + "John Doe is a software engineer at Google.", + "google OR john doe OR a OR software OR engineer", + ), ] for input_text, expected_output in test_cases: @@ -139,9 +148,9 @@ async def _(): result = text_to_tsvector_query(input_text, split_chunks=True) print(f"Generated query: 
'{result}'") print(f"Expected: '{expected_output}'\n") - - result_terms = set(term.lower() for term in result.split(" OR ") if term) - expected_terms = set(term.lower() for term in expected_output.split(" OR ") if term) + + result_terms = {term.lower() for term in result.split(" OR ") if term} + expected_terms = {term.lower() for term in expected_output.split(" OR ") if term} assert result_terms == expected_terms, ( f"Expected terms {expected_terms} but got {result_terms} for input '{input_text}'" - ) \ No newline at end of file + ) From 2c25490dbc04f34e24d062e1fdb5bf0fc55cfd1e Mon Sep 17 00:00:00 2001 From: Ahmad Haidar Date: Wed, 15 Jan 2025 11:45:15 +0300 Subject: [PATCH 22/27] chore(agents-api): utilize ``text_to_tsvector_query`` in search queries --- agents-api/agents_api/queries/docs/search_docs_by_text.py | 3 ++- agents-api/agents_api/queries/docs/search_docs_hybrid.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/agents-api/agents_api/queries/docs/search_docs_by_text.py b/agents-api/agents_api/queries/docs/search_docs_by_text.py index b1758625b..0a48123d2 100644 --- a/agents-api/agents_api/queries/docs/search_docs_by_text.py +++ b/agents-api/agents_api/queries/docs/search_docs_by_text.py @@ -8,6 +8,7 @@ from ...common.utils.db_exceptions import common_db_exceptions from ..utils import pg_query, rewrap_exceptions, wrap_in_class from .utils import transform_to_doc_reference +from ...common.nlp import text_to_tsvector_query # Raw query for text search search_docs_text_query = """ @@ -61,7 +62,7 @@ async def search_docs_by_text( owner_types: list[str] = [owner[0] for owner in owners] owner_ids: list[str] = [str(owner[1]) for owner in owners] # Pre-process rawtext query - # query = text_to_tsvector_query(query) + query = text_to_tsvector_query(query) return ( search_docs_text_query, diff --git a/agents-api/agents_api/queries/docs/search_docs_hybrid.py b/agents-api/agents_api/queries/docs/search_docs_hybrid.py index fe68bc075..d33347db7 100644 --- a/agents-api/agents_api/queries/docs/search_docs_hybrid.py +++ b/agents-api/agents_api/queries/docs/search_docs_hybrid.py @@ -6,6 +6,7 @@ from ...autogen.openapi_model import DocReference from ...common.utils.db_exceptions import common_db_exceptions +from ...common.nlp import text_to_tsvector_query from ..utils import ( pg_query, rewrap_exceptions, @@ -81,6 +82,9 @@ async def search_docs_hybrid( owner_types: list[str] = [owner[0] for owner in owners] owner_ids: list[str] = [str(owner[1]) for owner in owners] + # Pre-process rawtext query + text_query = text_to_tsvector_query(text_query) + return ( search_docs_hybrid_query, [ From 363c7c63cdcd2357bd4f613c4c3775002b9af355 Mon Sep 17 00:00:00 2001 From: Ahmad-mtos Date: Wed, 15 Jan 2025 08:46:07 +0000 Subject: [PATCH 23/27] refactor: Lint agents-api (CI) --- agents-api/agents_api/queries/docs/search_docs_by_text.py | 2 +- agents-api/agents_api/queries/docs/search_docs_hybrid.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agents-api/agents_api/queries/docs/search_docs_by_text.py b/agents-api/agents_api/queries/docs/search_docs_by_text.py index 0a48123d2..b98906466 100644 --- a/agents-api/agents_api/queries/docs/search_docs_by_text.py +++ b/agents-api/agents_api/queries/docs/search_docs_by_text.py @@ -5,10 +5,10 @@ from fastapi import HTTPException from ...autogen.openapi_model import DocReference +from ...common.nlp import text_to_tsvector_query from ...common.utils.db_exceptions import common_db_exceptions from ..utils import pg_query, 
rewrap_exceptions, wrap_in_class from .utils import transform_to_doc_reference -from ...common.nlp import text_to_tsvector_query # Raw query for text search search_docs_text_query = """ diff --git a/agents-api/agents_api/queries/docs/search_docs_hybrid.py b/agents-api/agents_api/queries/docs/search_docs_hybrid.py index d33347db7..1f7c363c4 100644 --- a/agents-api/agents_api/queries/docs/search_docs_hybrid.py +++ b/agents-api/agents_api/queries/docs/search_docs_hybrid.py @@ -5,8 +5,8 @@ from fastapi import HTTPException from ...autogen.openapi_model import DocReference -from ...common.utils.db_exceptions import common_db_exceptions from ...common.nlp import text_to_tsvector_query +from ...common.utils.db_exceptions import common_db_exceptions from ..utils import ( pg_query, rewrap_exceptions, From 9eb018f49469fc926da92a0d63b9557282a4f7f8 Mon Sep 17 00:00:00 2001 From: Ahmad Haidar Date: Wed, 15 Jan 2025 12:03:56 +0300 Subject: [PATCH 24/27] chore(agents-api): remove clean parameter from ``extract_keywords`` --- agents-api/agents_api/common/nlp.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 6c5f49f74..aea2380d8 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -42,7 +42,7 @@ def clean_keyword(kw: str) -> str: def extract_keywords( - doc: Doc, top_n: int = 25, clean: bool = True, split_chunks: bool = False + doc: Doc, top_n: int = 25, split_chunks: bool = False ) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { @@ -114,10 +114,7 @@ def extract_keywords( remaining_slots = max(0, top_n - len(top_keywords)) top_keywords += [kw for kw, _ in freq.most_common(remaining_slots)] - if clean: - return [clean_keyword(kw) for kw in top_keywords] - return top_keywords - + return [clean_keyword(kw) for kw in top_keywords] @lru_cache(maxsize=1000) def text_to_tsvector_query( From 9df8de412c4cb0cc2a23c8e123f92acafacb0d5b Mon Sep 17 00:00:00 2001 From: Ahmad-mtos Date: Wed, 15 Jan 2025 09:04:47 +0000 Subject: [PATCH 25/27] refactor: Lint agents-api (CI) --- agents-api/agents_api/common/nlp.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index aea2380d8..01f6ee7e1 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -41,9 +41,7 @@ def clean_keyword(kw: str) -> str: return WHITESPACE_RE.sub(" ", cleaned).strip() -def extract_keywords( - doc: Doc, top_n: int = 25, split_chunks: bool = False -) -> list[str]: +def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = False) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { "DATE", # Absolute or relative dates or periods. 
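One detail worth keeping in mind around these clean-ups: text_to_tsvector_query stays wrapped in @lru_cache(maxsize=1000), which is why it keeps taking a single hashable paragraph string plus scalar options, and why repeated identical search queries skip the spaCy pass entirely. An illustrative stand-in (the real function runs the NLP pipeline where the comment indicates):

    from functools import lru_cache

    @lru_cache(maxsize=1000)
    def build_query(paragraph: str, top_n: int = 25) -> str:
        # Stand-in for text_to_tsvector_query; the real implementation runs spaCy here
        return " OR ".join(sorted(set(paragraph.lower().split()))[:top_n])

    build_query("john doe at google")
    build_query("john doe at google")  # identical call, served from the cache
    print(build_query.cache_info())    # CacheInfo(hits=1, misses=1, maxsize=1000, currsize=1)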
@@ -116,6 +114,7 @@ def extract_keywords( return [clean_keyword(kw) for kw in top_keywords] + @lru_cache(maxsize=1000) def text_to_tsvector_query( paragraph: str, top_n: int = 25, min_keywords: int = 1, split_chunks: bool = False From 8fe87cbab6fca684f0b17cafd96e4207ea13061a Mon Sep 17 00:00:00 2001 From: Ahmad Haidar Date: Wed, 15 Jan 2025 12:27:46 +0300 Subject: [PATCH 26/27] fix(agents-api): increase test coverage + set ``split_cuncks=Ture`` as default --- agents-api/agents_api/common/nlp.py | 4 ++-- .../queries/docs/search_docs_by_text.py | 2 +- .../queries/docs/search_docs_hybrid.py | 2 +- agents-api/tests/test_nlp_utilities.py | 18 ++++++++++++++++++ 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 01f6ee7e1..c89339ae2 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -41,7 +41,7 @@ def clean_keyword(kw: str) -> str: return WHITESPACE_RE.sub(" ", cleaned).strip() -def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = False) -> list[str]: +def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = True) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { "DATE", # Absolute or relative dates or periods. @@ -117,7 +117,7 @@ def extract_keywords(doc: Doc, top_n: int = 25, split_chunks: bool = False) -> l @lru_cache(maxsize=1000) def text_to_tsvector_query( - paragraph: str, top_n: int = 25, min_keywords: int = 1, split_chunks: bool = False + paragraph: str, top_n: int = 25, min_keywords: int = 1, split_chunks: bool = True ) -> str: """ Extracts meaningful keywords/phrases from text and joins them with OR. diff --git a/agents-api/agents_api/queries/docs/search_docs_by_text.py b/agents-api/agents_api/queries/docs/search_docs_by_text.py index b98906466..6632d3162 100644 --- a/agents-api/agents_api/queries/docs/search_docs_by_text.py +++ b/agents-api/agents_api/queries/docs/search_docs_by_text.py @@ -62,7 +62,7 @@ async def search_docs_by_text( owner_types: list[str] = [owner[0] for owner in owners] owner_ids: list[str] = [str(owner[1]) for owner in owners] # Pre-process rawtext query - query = text_to_tsvector_query(query) + query = text_to_tsvector_query(query, split_chunks=True) return ( search_docs_text_query, diff --git a/agents-api/agents_api/queries/docs/search_docs_hybrid.py b/agents-api/agents_api/queries/docs/search_docs_hybrid.py index 1f7c363c4..6047069f8 100644 --- a/agents-api/agents_api/queries/docs/search_docs_hybrid.py +++ b/agents-api/agents_api/queries/docs/search_docs_hybrid.py @@ -83,7 +83,7 @@ async def search_docs_hybrid( owner_ids: list[str] = [str(owner[1]) for owner in owners] # Pre-process rawtext query - text_query = text_to_tsvector_query(text_query) + text_query = text_to_tsvector_query(text_query, split_chunks=True) return ( search_docs_hybrid_query, diff --git a/agents-api/tests/test_nlp_utilities.py b/agents-api/tests/test_nlp_utilities.py index 82c22911a..5677de7d9 100644 --- a/agents-api/tests/test_nlp_utilities.py +++ b/agents-api/tests/test_nlp_utilities.py @@ -80,6 +80,15 @@ async def _(): "google OR john doe OR a software engineer", ), ("- google", "google"), + # Test duplicate keyword handling + ( + "John Doe is great. John Doe is awesome.", + "john doe", # Should only include "John Doe" once + ), + ( + "Software Engineer at Google. 
Also, a Software Engineer.", + "Google OR Also a Software Engineer OR Software Engineer", # Should only include "Software Engineer" once + ), ] for input_text, expected_output in test_cases: @@ -141,6 +150,15 @@ async def _(): "John Doe is a software engineer at Google.", "google OR john doe OR a OR software OR engineer", ), + # Test duplicate keyword handling + ( + "John Doe is great. John Doe is awesome.", + "john doe", # Should only include "John Doe" once even with split_chunks=True + ), + ( + "Software Engineer at Google. Also, a Software Engineer.", + "Also OR a OR google OR software OR engineer", # When split, each word appears once + ), ] for input_text, expected_output in test_cases: From fcd2ad30db40e69e55aedfc585479ba00ef79770 Mon Sep 17 00:00:00 2001 From: Ahmad Haidar Date: Wed, 15 Jan 2025 12:31:22 +0300 Subject: [PATCH 27/27] tests hotfix --- agents-api/tests/test_nlp_utilities.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/agents-api/tests/test_nlp_utilities.py b/agents-api/tests/test_nlp_utilities.py index 5677de7d9..733f695d5 100644 --- a/agents-api/tests/test_nlp_utilities.py +++ b/agents-api/tests/test_nlp_utilities.py @@ -26,11 +26,28 @@ async def _(): assert clean_keyword("- try") == "try" -@test("utility: extract_keywords") +@test("utility: extract_keywords - split_chunks=False") async def _(): nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "textcat"]) doc = nlp("John Doe is a software engineer at Google.") - assert set(extract_keywords(doc)) == {"John Doe", "a software engineer", "Google"} + assert set(extract_keywords(doc, split_chunks=False)) == { + "John Doe", + "a software engineer", + "Google", + } + + +@test("utility: extract_keywords - split_chunks=True") +async def _(): + nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "textcat"]) + doc = nlp("John Doe is a software engineer at Google.") + assert set(extract_keywords(doc, split_chunks=True)) == { + "John Doe", + "a", + "software", + "engineer", + "Google", + } @test("utility: text_to_tsvector_query - split_chunks=False")
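The order-insensitive assertion used throughout these tests (splitting on " OR ", lowercasing, comparing sets) appears verbatim in each test body. If more variants get added, it could be pulled into a small helper along these lines (a sketch, not part of the patch series):

    def or_terms(query: str) -> set[str]:
        # Split an "a OR b OR c" query into a case-insensitive set of terms;
        # empty fragments are dropped so "" maps to an empty set.
        return {term.lower() for term in query.split(" OR ") if term}

    def assert_same_terms(result: str, expected: str, input_text: str) -> None:
        assert or_terms(result) == or_terms(expected), (
            f"Expected terms {or_terms(expected)} but got {or_terms(result)} "
            f"for input '{input_text}'"
        )

    # assert_same_terms("google OR john doe", "john doe OR google", "John Doe at Google")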