Skip to content

Commit

Permalink
DH-4784 Store all responses and code refactoring (#195)
Browse files Browse the repository at this point in the history
* DH-4784 Store all responses and refactoring

* Calculate confidence score when a response is created

* Add GET endpoints to retrieve one resource for responses, table-descriptions and questions
  • Loading branch information
jcjc712 authored Oct 6, 2023
1 parent 9b56161 commit f57fde5
Show file tree
Hide file tree
Showing 45 changed files with 585 additions and 475 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -370,11 +370,11 @@ curl -X 'PUT' \


### Querying the Database in Natural Language
Once you have connected the engine to your data warehouse (and preferably added some context to the store), you can query your data warehouse using the `POST /api/v1/question` endpoint.
Once you have connected the engine to your data warehouse (and preferably added some context to the store), you can query your data warehouse using the `POST /api/v1/questions` endpoint.

```
curl -X 'POST' \
'<host>/api/v1/question' \
'<host>/api/v1/questions' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
Expand Down
40 changes: 26 additions & 14 deletions dataherald/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,21 @@

from dataherald.api.types import Query
from dataherald.config import Component
from dataherald.db_scanner.models.types import TableSchemaDetail
from dataherald.db_scanner.models.types import TableDescription
from dataherald.sql_database.models.types import DatabaseConnection, SSHSettings
from dataherald.types import (
CreateResponseRequest,
DatabaseConnectionRequest,
ExecuteTempQueryRequest,
GoldenRecord,
GoldenRecordRequest,
Instruction,
InstructionRequest,
NLQueryResponse,
Question,
QuestionRequest,
Response,
ScannerRequest,
TableDescriptionRequest,
UpdateInstruction,
UpdateQueryRequest,
)


Expand All @@ -36,7 +36,15 @@ def scan_db(
pass

@abstractmethod
def answer_question(self, question_request: QuestionRequest) -> NLQueryResponse:
def answer_question(self, question_request: QuestionRequest) -> Response:
pass

@abstractmethod
def get_questions(self, db_connection_id: str | None = None) -> list[Question]:
    """Return stored questions.

    Args:
        db_connection_id: Optional database connection id; implementations
            may use it to narrow the result.

    Returns:
        A list of Question objects.
    """

@abstractmethod
def get_question(self, question_id: str) -> Question:
    """Return the single question identified by ``question_id``.

    Args:
        question_id: Identifier of the question to retrieve.

    Returns:
        The matching Question.
    """

@abstractmethod
Expand All @@ -62,13 +70,17 @@ def update_table_description(
self,
table_description_id: str,
table_description_request: TableDescriptionRequest,
) -> TableSchemaDetail:
) -> TableDescription:
pass

@abstractmethod
def list_table_descriptions(
self, db_connection_id: str | None = None, table_name: str | None = None
) -> list[TableSchemaDetail]:
self, db_connection_id: str, table_name: str | None = None
) -> list[TableDescription]:
pass

@abstractmethod
def get_table_description(self, table_description_id: str) -> TableDescription:
    """Return the single table description identified by its id.

    Args:
        table_description_id: Identifier of the table description to retrieve.

    Returns:
        The matching TableDescription.
    """

@abstractmethod
Expand All @@ -82,15 +94,15 @@ def execute_sql_query(self, query: Query) -> tuple[str, dict]:
pass

@abstractmethod
def update_nl_query_response(
self, query_id: str, query: UpdateQueryRequest
) -> NLQueryResponse:
def create_response(self, query_request: CreateResponseRequest) -> Response:
pass

@abstractmethod
def get_responses(self, question_id: str | None = None) -> list[Response]:
    """Return stored responses.

    Args:
        question_id: Optional question id; implementations may use it to
            narrow the result.

    Returns:
        A list of Response objects.
    """

@abstractmethod
def get_nl_query_response(
self, query_request: ExecuteTempQueryRequest
) -> NLQueryResponse:
def get_response(self, response_id: str) -> Response:
pass

@abstractmethod
Expand Down
148 changes: 82 additions & 66 deletions dataherald/api/fastapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import List

from bson import json_util
from bson.objectid import ObjectId
from fastapi import BackgroundTasks, HTTPException
from overrides import override

Expand All @@ -13,14 +14,14 @@
from dataherald.context_store import ContextStore
from dataherald.db import DB
from dataherald.db_scanner import Scanner
from dataherald.db_scanner.models.types import TableDescriptionStatus, TableSchemaDetail
from dataherald.db_scanner.repository.base import DBScannerRepository
from dataherald.db_scanner.models.types import TableDescription, TableDescriptionStatus
from dataherald.db_scanner.repository.base import TableDescriptionRepository
from dataherald.eval import Evaluator
from dataherald.repositories.base import NLQueryResponseRepository
from dataherald.repositories.base import ResponseRepository
from dataherald.repositories.database_connections import DatabaseConnectionRepository
from dataherald.repositories.golden_records import GoldenRecordRepository
from dataherald.repositories.instructions import InstructionRepository
from dataherald.repositories.nl_question import NLQuestionRepository
from dataherald.repositories.question import QuestionRepository
from dataherald.sql_database.base import (
InvalidDBConnectionError,
SQLDatabase,
Expand All @@ -30,19 +31,18 @@
from dataherald.sql_generator import SQLGenerator
from dataherald.sql_generator.generates_nl_answer import GeneratesNlAnswer
from dataherald.types import (
CreateResponseRequest,
DatabaseConnectionRequest,
ExecuteTempQueryRequest,
GoldenRecord,
GoldenRecordRequest,
Instruction,
InstructionRequest,
NLQuery,
NLQueryResponse,
Question,
QuestionRequest,
Response,
ScannerRequest,
TableDescriptionRequest,
UpdateInstruction,
UpdateQueryRequest,
)

logger = logging.getLogger(__name__)
Expand All @@ -53,7 +53,7 @@ def async_scanning(scanner, database, scanner_request, storage):
database,
scanner_request.db_connection_id,
scanner_request.table_names,
DBScannerRepository(storage),
TableDescriptionRepository(storage),
)


Expand Down Expand Up @@ -103,7 +103,7 @@ def scan_db(
scanner.synchronizing(
scanner_request.table_names,
scanner_request.db_connection_id,
DBScannerRepository(self.storage),
TableDescriptionRepository(self.storage),
)

background_tasks.add_task(
Expand All @@ -112,20 +112,20 @@ def scan_db(
return True

@override
def answer_question(self, question_request: QuestionRequest) -> NLQueryResponse:
def answer_question(self, question_request: QuestionRequest) -> Response:
"""Takes in an English question and answers it based on content from the registered databases"""
logger.info(f"Answer question: {question_request.question}")
sql_generation = self.system.instance(SQLGenerator)
evaluator = self.system.instance(Evaluator)
context_store = self.system.instance(ContextStore)

user_question = NLQuery(
user_question = Question(
question=question_request.question,
db_connection_id=question_request.db_connection_id,
)

nl_question_repository = NLQuestionRepository(self.storage)
user_question = nl_question_repository.insert(user_question)
question_repository = QuestionRepository(self.storage)
user_question = question_repository.insert(user_question)

db_connection_repository = DatabaseConnectionRepository(self.storage)
database_connection = db_connection_repository.find_by_id(
Expand All @@ -149,9 +149,8 @@ def answer_question(self, question_request: QuestionRequest) -> NLQueryResponse:
raise HTTPException(status_code=404, detail=str(e)) from e
generated_answer.confidence_score = confidence_score
generated_answer.exec_time = time.time() - start_generated_answer
nl_query_response_repository = NLQueryResponseRepository(self.storage)
nl_query_response = nl_query_response_repository.insert(generated_answer)
return json.loads(json_util.dumps(nl_query_response))
response_repository = ResponseRepository(self.storage)
return response_repository.insert(generated_answer)

@override
def create_database_connection(
Expand Down Expand Up @@ -217,8 +216,8 @@ def update_table_description(
self,
table_description_id: str,
table_description_request: TableDescriptionRequest,
) -> TableSchemaDetail:
scanner_repository = DBScannerRepository(self.storage)
) -> TableDescription:
scanner_repository = TableDescriptionRepository(self.storage)
table = scanner_repository.find_by_id(table_description_id)

if not table:
Expand All @@ -238,11 +237,11 @@ def update_table_description(

@override
def list_table_descriptions(
self, db_connection_id: str | None = None, table_name: str | None = None
) -> list[TableSchemaDetail]:
scanner_repository = DBScannerRepository(self.storage)
self, db_connection_id: str, table_name: str | None = None
) -> list[TableDescription]:
scanner_repository = TableDescriptionRepository(self.storage)
table_descriptions = scanner_repository.find_by(
{"db_connection_id": db_connection_id, "table_name": table_name}
{"db_connection_id": ObjectId(db_connection_id), "table_name": table_name}
)

if db_connection_id:
Expand All @@ -260,7 +259,7 @@ def list_table_descriptions(
all_tables.remove(table_description.table_name)
for table in all_tables:
table_descriptions.append(
TableSchemaDetail(
TableDescription(
table_name=table,
status=TableDescriptionStatus.NOT_SYNCHRONIZED.value,
db_connection_id=db_connection_id,
Expand All @@ -270,6 +269,37 @@ def list_table_descriptions(

return table_descriptions

@override
def get_table_description(self, table_description_id: str) -> TableDescription:
    """Return the stored table description with the given id.

    Args:
        table_description_id: Identifier of the table description to load.

    Returns:
        The table description returned by the repository lookup.

    Raises:
        HTTPException: 404 when the lookup yields no record.
    """
    scanner_repository = TableDescriptionRepository(self.storage)
    table_description = scanner_repository.find_by_id(table_description_id)
    # The lookup can come back falsy for an unknown id (update_table_description
    # guards the same call with `if not table`). Surface that as a 404 rather
    # than returning None from a method annotated to return a TableDescription.
    if not table_description:
        raise HTTPException(status_code=404, detail="Table description not found")
    return table_description

@override
def get_responses(self, question_id: str | None = None) -> list[Response]:
    """List stored responses, optionally restricted to a single question.

    Args:
        question_id: When truthy, only responses whose ``question_id`` equals
            ``ObjectId(question_id)`` are requested; otherwise an empty filter
            is passed to the repository.

    Returns:
        Whatever ``ResponseRepository.find_by`` returns for the filter.
    """
    repository = ResponseRepository(self.storage)
    filters = {"question_id": ObjectId(question_id)} if question_id else {}
    return repository.find_by(filters)

@override
def get_response(self, response_id: str) -> Response:
    """Return the stored response with the given id.

    Args:
        response_id: Identifier of the response to load.

    Returns:
        The response returned by the repository lookup.

    Raises:
        HTTPException: 404 when the lookup yields no record.
    """
    response_repository = ResponseRepository(self.storage)
    response = response_repository.find_by_id(response_id)
    # NOTE(review): assumes find_by_id returns None for an unknown id - confirm
    # against ResponseRepository. If it raises instead, this guard is never hit.
    if response is None:
        raise HTTPException(status_code=404, detail="Response not found")
    return response

@override
def get_questions(self, db_connection_id: str | None = None) -> list[Question]:
    """List stored questions, optionally restricted to one database connection.

    Args:
        db_connection_id: When truthy, only questions whose
            ``db_connection_id`` equals ``ObjectId(db_connection_id)`` are
            requested; otherwise an empty filter is passed to the repository.

    Returns:
        Whatever ``QuestionRepository.find_by`` returns for the filter.
    """
    repository = QuestionRepository(self.storage)
    filters = (
        {"db_connection_id": ObjectId(db_connection_id)} if db_connection_id else {}
    )
    return repository.find_by(filters)

@override
def get_question(self, question_id: str) -> Question:
    """Return the stored question with the given id.

    Args:
        question_id: Identifier of the question to load.

    Returns:
        The question returned by the repository lookup.

    Raises:
        HTTPException: 404 when the lookup yields no record.
    """
    question_repository = QuestionRepository(self.storage)
    question = question_repository.find_by_id(question_id)
    # NOTE(review): assumes find_by_id returns None for an unknown id - confirm
    # against QuestionRepository. If it raises instead, this guard is never hit.
    if question is None:
        raise HTTPException(status_code=404, detail="Question not found")
    return question

@override
def add_golden_records(
self, golden_records: List[GoldenRecordRequest]
Expand All @@ -295,53 +325,39 @@ def execute_sql_query(self, query: Query) -> tuple[str, dict]:
return result

@override
def update_nl_query_response(
self, query_id: str, query: UpdateQueryRequest # noqa: ARG002
) -> NLQueryResponse:
nl_query_response_repository = NLQueryResponseRepository(self.storage)
nl_question_repository = NLQuestionRepository(self.storage)
nl_query_response = nl_query_response_repository.find_by_id(query_id)
nl_question = nl_question_repository.find_by_id(
nl_query_response.nl_question_id
def create_response(
self, query_request: CreateResponseRequest # noqa: ARG002
) -> Response:
evaluator = self.system.instance(Evaluator)
question_repository = QuestionRepository(self.storage)
user_question = question_repository.find_by_id(query_request.question_id)
db_connection_repository = DatabaseConnectionRepository(self.storage)
database_connection = db_connection_repository.find_by_id(
user_question.db_connection_id
)
if nl_query_response.sql_query.strip() != query.sql_query.strip():
nl_query_response.sql_query = query.sql_query
evaluator = self.system.instance(Evaluator)
db_connection_repository = DatabaseConnectionRepository(self.storage)
database_connection = db_connection_repository.find_by_id(
nl_question.db_connection_id
)
if not database_connection:
raise HTTPException(
status_code=404, detail="Database connection not found"
)
try:
confidence_score = evaluator.get_confidence_score(
nl_question, nl_query_response, database_connection
)
nl_query_response.confidence_score = confidence_score
generates_nl_answer = GeneratesNlAnswer(self.system, self.storage)
nl_query_response = generates_nl_answer.execute(nl_query_response)
except SQLInjectionError as e:
raise HTTPException(status_code=404, detail=str(e)) from e
nl_query_response_repository.update(nl_query_response)
return json.loads(json_util.dumps(nl_query_response))
if not database_connection:
raise HTTPException(status_code=404, detail="Database connection not found")

@override
def get_nl_query_response(
self, query_request: ExecuteTempQueryRequest # noqa: ARG002
) -> NLQueryResponse:
nl_query_response_repository = NLQueryResponseRepository(self.storage)
nl_query_response = nl_query_response_repository.find_by_id(
query_request.query_id
response = Response(
question_id=query_request.question_id, sql_query=query_request.sql_query
)
nl_query_response.sql_query = query_request.sql_query
response_repository = ResponseRepository(self.storage)
response_repository.insert(response)
start_generated_answer = time.time()
try:
generates_nl_answer = GeneratesNlAnswer(self.system, self.storage)
nl_query_response = generates_nl_answer.execute(nl_query_response)
response = generates_nl_answer.execute(response)
confidence_score = evaluator.get_confidence_score(
user_question, response, database_connection
)
response.confidence_score = confidence_score
response.exec_time = time.time() - start_generated_answer
response_repository.update(response)
except ValueError as e:
raise HTTPException(status_code=404, detail=str(e)) from e
except SQLInjectionError as e:
raise HTTPException(status_code=404, detail=str(e)) from e
return json.loads(json_util.dumps(nl_query_response))
return response

@override
def delete_golden_record(self, golden_record_id: str) -> dict:
Expand All @@ -356,7 +372,7 @@ def get_golden_records(
golden_records_repository = GoldenRecordRepository(self.storage)
if db_connection_id:
return golden_records_repository.find_by(
{"db_connection_id": db_connection_id},
{"db_connection_id": ObjectId(db_connection_id)},
page=page,
limit=limit,
)
Expand All @@ -378,7 +394,7 @@ def get_instructions(
instruction_repository = InstructionRepository(self.storage)
if db_connection_id:
return instruction_repository.find_by(
{"db_connection_id": db_connection_id},
{"db_connection_id": ObjectId(db_connection_id)},
page=page,
limit=limit,
)
Expand Down
4 changes: 2 additions & 2 deletions dataherald/context_store/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from dataherald.config import Component, System
from dataherald.db import DB
from dataherald.types import GoldenRecord, GoldenRecordRequest, NLQuery
from dataherald.types import GoldenRecord, GoldenRecordRequest, Question
from dataherald.vector_store import VectorStore


Expand All @@ -24,7 +24,7 @@ def __init__(self, system: System):

@abstractmethod
def retrieve_context_for_question(
self, nl_question: NLQuery, number_of_samples: int = 3
self, nl_question: Question, number_of_samples: int = 3
) -> Tuple[List[dict] | None, List[dict] | None]:
pass

Expand Down
Loading

0 comments on commit f57fde5

Please sign in to comment.