Merge pull request #20 from ittia-research/dev
change default search backend, update endpoint
etwk authored Sep 2, 2024
2 parents 1b7e5cb + d44b50e commit c11d941
Showing 10 changed files with 60 additions and 106 deletions.
39 changes: 21 additions & 18 deletions README.md
@@ -3,7 +3,17 @@ True, false, or just opinions? Maybe not binary, but a percentage.
Fact-checking tools to combat disinformation.

## Get Started
Online demo: https://check.ittia.net
Fact-check:
- Online demo: `https://check.ittia.net`
- API docs: `https://check.ittia.net/doc`

Search backend:
- Uses a self-hosted search backend for better optimization.
- API doc: `https://search.ittia.net/doc`
- Features:
  - Uses a first-class search engine (Google at the moment).
  - Customizable count of sources.
  - Supports search sessions: streaming and resume.
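
For illustration, here is a minimal sketch of calling the fact-check endpoint listed above. It assumes, based on `src/main.py` in this diff, that the statement is passed as the URL path and that an `Accept: text/markdown` header selects the streamed markdown report; the statement text is a placeholder.

```python
# Minimal sketch, not an official client: stream a fact-check report as markdown.
# Assumes the statement goes in the URL path and "Accept: text/markdown"
# selects the streaming markdown output (see src/main.py in this diff).
import asyncio
from urllib.parse import quote

import httpx


async def fact_check(statement: str) -> str:
    url = f"https://check.ittia.net/{quote(statement)}"
    headers = {"Accept": "text/markdown"}
    chunks = []
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream("GET", url, headers=headers) as response:
            response.raise_for_status()
            async for chunk in response.aiter_text():
                chunks.append(chunk)
    return "".join(chunks)


if __name__ == "__main__":
    print(asyncio.run(fact_check("The Earth is flat")))
```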

## Design
Input something.
@@ -14,7 +24,7 @@ Factcheck like what a researcher will do:
* Use search engines as the data source and AI for the verdict.

Output analysis:
* MARKDOWN as the default format, JSON as one option.
* MARKDOWN as the default format, JSON optional.

### Philosophy:
- For new information, doubt by default; fact-check follows.
@@ -26,16 +36,9 @@ Input types:
- questions

Verdicts:
- true
- false
- uncheckable: can't check without more background
- unavailable: service unavailable

## Support
Please contact if you can provide resources for this project:
- AI API access
- Hardware for hosting
- Data sources
- true
- irrelevant: the processed context is irrelevant to the statement

## Todo
### Frontend
@@ -64,7 +67,6 @@ Retrieval

### pipeline
DSPy:
- [ ] make dspy.settings apply to sessions only in order to support multiple retrieve index
- [ ] choose the right LLM temperature
- [ ] better training datasets

@@ -82,7 +84,7 @@ DSPy:
- [ ] Use multiple sources for factcheck.

### Stability
- [ ] AI backend stress test, especially xinference.
- [ ] Stress test.

### Extend
- [ ] To other types of media: image, audio, video, etc.
@@ -97,9 +99,6 @@ DSPy:
### Logging
- [ ] Full logging of the chain of events for reproducing and debugging.

### Doc
- [ ] Show current tech stack.

### Checkout
- [ ] Chroma #retrieve

@@ -110,12 +109,16 @@ DSPy:
## References
### Reports
- [ ] AI-generated misinformation

### Factcheck
- https://www.snopes.com
- https://www.bmi.bund.de/SharedDocs/schwerpunkte/EN/disinformation/examples-of-russian-disinformation-and-the-facts.html

### Resources
#### Inference
- https://console.groq.com/docs/ (free tier)
Inference
- https://console.groq.com/docs/ (free tier)
Search and fetch:
- https://jina.ai/read

## Acknowledgements
- TPU Research Cloud team at Google
2 changes: 2 additions & 0 deletions docs/changelog.md
@@ -3,6 +3,8 @@
- Change from AutoGen to plain OpenAI, since AutoGen AssistantAgent adds a system role which is not compatible with Gemma 2 + vLLM.
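
For reference, a minimal sketch of the "plain OpenAI" call pattern this entry refers to: a single user-role message sent to an OpenAI-compatible vLLM endpoint, since Gemma 2 chat templates reject the system role. The base URL, API key, and model name are illustrative placeholders, not the project's actual configuration.

```python
# Minimal sketch: call an OpenAI-compatible vLLM endpoint with a user role only,
# because Gemma 2 chat templates do not accept a system message.
# Base URL, API key, and model name are illustrative placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://vllm:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="google/gemma-2-9b-it",
    messages=[{"role": "user", "content": "Fact-check: the Earth is flat."}],
    temperature=0,
)
print(response.choices[0].message.content)
```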

## pipeline
2024/9/2:
- Changed search backend to `https://search.ittia.net` for better optimization.
2024/8/26:
- Changed to multi-source mode (sources divided by hostname) instead of using all web search results as a single source.
2024/8/13:
2 changes: 1 addition & 1 deletion infra/env.d/check
@@ -22,6 +22,6 @@ RERANK_BASE_URL=http://infinity:7997
RERANK_MODEL_DEPLOY=api
RERANK_MODEL_NAME=jinaai/jina-reranker-v2-base-multilingual

SEARCH_BASE_URL=https://s.jina.ai
SEARCH_BASE_URL=https://search.ittia.net

PROJECT_HOSTING_BASE_URL=http://127.0.0.1:8000
2 changes: 1 addition & 1 deletion src/api/__init__.py
@@ -1,2 +1,2 @@
from .fetch import FetchUrl
from .read import ReadUrl
from .search import SearchWeb
8 changes: 4 additions & 4 deletions src/api/fetch.py → src/api/read.py
@@ -7,12 +7,12 @@

client = httpx.AsyncClient(http2=True, follow_redirects=True)

class FetchUrl():
"""Fetch one single url via API fetch endpoint"""
class ReadUrl():
"""Read one single url via API fetch endpoint"""

def __init__(self, url: str):
self.url = url
self.api = settings.SEARCH_BASE_URL + '/fetch'
self.api = settings.SEARCH_BASE_URL + '/read'
self.timeout = 120 # API request timeout; set higher because the API backend might need to retry a few times

@retry(stop=stop_after_attempt(3), wait=wait_fixed(0.1), before_sleep=utils.retry_log_warning, reraise=True)
@@ -23,5 +23,5 @@ async def get(self):
response = await client.post(self.api, json=_data, timeout=self.timeout)
_r = response.json()
if _r['status'] != 'ok':
raise Exception(f"Fetch url return status not ok: {self.url}")
raise Exception(f"Read url return status not ok: {self.url}") # TODO: avoid duplicated retry
return _r['data']
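
A minimal usage sketch of the renamed class, assuming it runs from the project's `src/` directory so that `from api import ReadUrl` resolves; the URL is a placeholder.

```python
# Minimal usage sketch (run from the project's src/ directory).
# ReadUrl.get() posts the URL to SEARCH_BASE_URL + "/read", retries up to
# 3 times, and returns the 'data' dict (page content plus metadata such as 'title').
import asyncio

from api import ReadUrl


async def main():
    data = await ReadUrl(url="https://example.com").get()
    print(data.get("title"))


asyncio.run(main())
```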
76 changes: 7 additions & 69 deletions src/main.py
@@ -1,9 +1,9 @@
import asyncio
import json
import logging
from fastapi import FastAPI, HTTPException, Request, Header
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import Response, JSONResponse, HTMLResponse, PlainTextResponse, FileResponse, StreamingResponse
import logging

import pipeline, utils, web
from modules import Search
@@ -17,64 +17,6 @@

app = FastAPI()

# """
# Process input string, fact-check and output MARKDOWN
# """
# async def fact_check(input):
# status = 500
# logger.info(f"Fact checking: {input}")

# # get list of statements
# try:
# statements = await run_in_threadpool(pipeline.get_statements, input)
# logger.info(f"statements: {statements}")
# except Exception as e:
# logger.error(f"Get statements failed: {e}")
# raise HTTPException(status_code=status, detail="No statements found")

# verdicts = []
# fail_search = False
# for statement in statements:
# if not statement:
# continue
# logger.info(f"Statement: {statement}")

# # get search query
# try:
# query = await run_in_threadpool(pipeline.get_search_query, statement)
# logger.info(f"Search query: {query}")
# except Exception as e:
# logger.error(f"Getting search query from statement '{statement}' failed: {e}")
# continue

# # searching
# try:
# search = await Search(query)
# logger.info(f"Head of search results: {json.dumps(search)[0:500]}")
# except Exception as e:
# fail_search = True
# logger.error(f"Search '{query}' failed: {e}")
# continue

# # get verdict
# try:
# verdict = await run_in_threadpool(pipeline.get_verdict, search_json=search, statement=statement)
# logger.info(f"Verdict: {verdict}")
# except Exception as e:
# logger.error(f"Getting verdict for statement '{statement}' failed: {e}")
# continue

# verdicts.append(verdict)

# if not verdicts:
# if fail_search:
# raise HTTPException(status_code=status, detail="Search not available")
# else:
# raise HTTPException(status_code=status, detail="No verdicts found")

# report = utils.generate_report_markdown(input, verdicts)
# return report

# TODO: multi-stage response
async def stream_response(path):
union = pipeline.Union(path)
@@ -98,10 +40,6 @@ async def stream_response(path):
async def startup_event():
pass

@app.get("/robots.txt", response_class=FileResponse)
async def robots():
return "web/robots.txt"

@app.get("/health")
async def health():
return {"status": "ok"}
@@ -112,16 +50,16 @@ async def status():
return _status

# TODO: integrate error handling with output
@app.get("/{path:path}", response_class=PlainTextResponse)
async def catch_all(path: str, accept: str = Header(None)):
@app.get("/{input:path}", response_class=PlainTextResponse)
async def catch_all(input: str, accept: str = Header(None)):
try:
if not utils.check_input(path):
return HTMLResponse(status_code=404, content="Invalid request") # filter brower background requests
if not utils.check_input(input):
return HTMLResponse(status_code=404, content='not found') # filter brower background requests

if accept == "text/markdown":
if not path:
if not input:
return utils.get_stream(stage='final', content=web.get_homepage())
return StreamingResponse(stream_response(path), media_type="text/event-stream")
return StreamingResponse(stream_response(input), media_type="text/event-stream")
else:
return HTMLResponse(content=web.html_browser)
except HTTPException as e:
30 changes: 22 additions & 8 deletions src/pipeline/__init__.py
@@ -7,7 +7,7 @@
from urllib.parse import urlparse

import utils
from api import FetchUrl, SearchWeb
from api import ReadUrl, SearchWeb
from modules import SearchQuery, Statements
from modules import llm_long, Citation, LlamaIndexRM, ContextVerdict
from settings import settings
@@ -22,9 +22,13 @@ class Union():
Run the full cycle from raw input to verdicts of multiple statements.
Keep data in the class.
Exception handling:
- Mark doc as invalid if failed to read content.
TODO:
- Add support for verdict standards.
- Make better use of the other data from web search.
- Generate or draw the class data structure.
"""

def __init__(self, input: str):
@@ -65,16 +69,19 @@ async def _pipe_source(self, data_source, statement):
# update docs
_task_docs = []
for _, data_doc in data_source['docs'].items():
if not data_doc.get('doc'): # TODO: better way to decide if update doc
if not data_doc.get('doc') and data_doc.get('valid') != False: # TODO: better way to decide whether to update the doc
_task_docs.append(asyncio.create_task(self.update_doc(data_doc)))
await asyncio.gather(*_task_docs) # finish all docs processing

# update retriever
docs = [v['doc'] for v in data_source['docs'].values()]
data_source["retriever"] = await run_in_threadpool(LlamaIndexRM, docs=docs)

# update verdict, citation
await run_in_threadpool(self.update_verdict_citation, data_source, statement)
docs = [v['doc'] for v in data_source['docs'].values() if v.get('valid') != False]
if docs:
data_source["retriever"] = await run_in_threadpool(LlamaIndexRM, docs=docs)

# update verdict, citation
await run_in_threadpool(self.update_verdict_citation, data_source, statement)
else:
data_source['valid'] = False # TODO: update status after adding a valid doc

# Statements has retry set already, do not retry here
async def get_statements(self):
@@ -120,7 +127,12 @@ async def update_source_map(self, data_sources, query):

async def update_doc(self, data_doc):
"""Update doc (URL content for now)"""
_rep = await FetchUrl(url=data_doc['url']).get()
try:
_rep = await ReadUrl(url=data_doc['url']).get()
except:
data_doc['valid'] = False
logging.warning(f"Failed to read URL, mark as invalid: {data_doc['url']}")
return
data_doc['raw'] = _rep # dict including URL content and metadata, etc.
data_doc['title'] = _rep['title']
data_doc['doc'] = utils.search_result_to_doc(_rep) # TODO: better process
@@ -164,6 +176,8 @@ def update_summary(self, data_statement):
}

for hostname, verdict in data_statement['sources'].items():
if verdict.get('valid') == False:
continue
weight_total += 1
v = verdict['verdict'].lower()
if v in sum_citation:
2 changes: 1 addition & 1 deletion src/settings.py
@@ -12,7 +12,7 @@ def __init__(self):
self.EMBEDDING_BASE_URL = os.environ.get("EMBEDDING_BASE_URL") or "http://ollama:11434"
self.RERANK_BASE_URL = os.environ.get("RERANK_BASE_URL") or "http://xinference:9997/v1"
self.PROJECT_HOSTING_BASE_URL = os.environ.get("PROJECT_HOSTING_BASE_URL") or "https://check.ittia.net"
self.SEARCH_BASE_URL = os.environ.get("SEARCH_BASE_URL") or "https://s.jina.ai"
self.SEARCH_BASE_URL = os.environ.get("SEARCH_BASE_URL") or "https://search.ittia.net"

# set RAG models deploy mode
self.EMBEDDING_MODEL_DEPLOY = os.environ.get("EMBEDDING_MODEL_DEPLOY") or "local"
2 changes: 1 addition & 1 deletion src/utils.py
@@ -65,7 +65,7 @@ def check_input(input):

# check invalid whole query
invalid_path = ['YOUR_FACT_CHECK_QUERY']
common_web_requests = ["favicon.ico"]
common_web_requests = ["robots.txt", "favicon.ico"]
if input in itertools.chain(invalid_path, common_web_requests):
return False

3 changes: 0 additions & 3 deletions src/web/robots.txt

This file was deleted.
