From ee3a450de5a92d031ecf62cddda927edd88163d2 Mon Sep 17 00:00:00 2001 From: etwk <48991073+etwk@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:47:42 +0000 Subject: [PATCH 1/2] debug --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index 1e1c76d..af68610 100644 --- a/src/main.py +++ b/src/main.py @@ -3,7 +3,7 @@ import logging from fastapi import FastAPI, HTTPException, Request, Header from fastapi.concurrency import run_in_threadpool -from fastapi.responses import Response, JSONResponse, HTMLResponse, PlainTextResponse, FileResponse, StreamingResponse +from fastapi.responses import Response, JSONResponse, HTMLResponse, PlainTextResponse, FileResponse, RedirectResponse, StreamingResponse import pipeline, utils, web from modules import Search From 9110b06bb962643fb705f9bf9db118f37c42cb08 Mon Sep 17 00:00:00 2001 From: etwk <48991073+etwk@users.noreply.github.com> Date: Tue, 3 Sep 2024 14:19:16 +0000 Subject: [PATCH 2/2] lint --- datasets/wiki_dpr/prepare_files.py | 5 ++--- src/api/__init__.py | 2 ++ src/api/read.py | 1 - src/api/search.py | 1 - src/integrations/__init__.py | 2 ++ src/main.py | 11 +++++------ src/modules/__init__.py | 16 +++++++++------- src/modules/retrieve.py | 13 ++++--------- src/pipeline/__init__.py | 9 +++++---- src/settings.py | 5 +++-- src/web/__init__.py | 2 ++ src/web/homepage.py | 2 +- 12 files changed, 35 insertions(+), 34 deletions(-) diff --git a/datasets/wiki_dpr/prepare_files.py b/datasets/wiki_dpr/prepare_files.py index 9782536..38ad959 100644 --- a/datasets/wiki_dpr/prepare_files.py +++ b/datasets/wiki_dpr/prepare_files.py @@ -1,4 +1,5 @@ -import os, subprocess +import os +import subprocess import shutil from huggingface_hub import snapshot_download from tenacity import retry, stop_after_attempt, wait_fixed @@ -23,8 +24,6 @@ ] revision = "main" -import os - def check_exists(folder_path): # Check if the folder exists if os.path.exists(folder_path) and os.path.isdir(folder_path): diff --git a/src/api/__init__.py b/src/api/__init__.py index 4cfb7fd..d8face1 100644 --- a/src/api/__init__.py +++ b/src/api/__init__.py @@ -1,2 +1,4 @@ +__all__ = ['ReadUrl', 'SearchWeb'] + from .read import ReadUrl from .search import SearchWeb diff --git a/src/api/read.py b/src/api/read.py index 4151fd1..38614fc 100644 --- a/src/api/read.py +++ b/src/api/read.py @@ -1,5 +1,4 @@ import httpx -import json from tenacity import retry, stop_after_attempt, wait_fixed import utils diff --git a/src/api/search.py b/src/api/search.py index 5bef64f..a9ef90a 100644 --- a/src/api/search.py +++ b/src/api/search.py @@ -1,4 +1,3 @@ -import asyncio import httpx import json from tenacity import retry, stop_after_attempt, wait_fixed diff --git a/src/integrations/__init__.py b/src/integrations/__init__.py index 2276b56..8452465 100644 --- a/src/integrations/__init__.py +++ b/src/integrations/__init__.py @@ -1,2 +1,4 @@ +__all__ = ['InfinityEmbedding', 'OllamaEmbedding'] + from .infinity_embedding import InfinityEmbedding from .ollama_embedding import OllamaEmbedding \ No newline at end of file diff --git a/src/main.py b/src/main.py index af68610..cd2ab4f 100644 --- a/src/main.py +++ b/src/main.py @@ -1,12 +1,11 @@ import asyncio -import json import logging -from fastapi import FastAPI, HTTPException, Request, Header -from fastapi.concurrency import run_in_threadpool -from fastapi.responses import Response, JSONResponse, HTMLResponse, PlainTextResponse, FileResponse, RedirectResponse, StreamingResponse +from fastapi import FastAPI, HTTPException, Header +from fastapi.responses import HTMLResponse, PlainTextResponse, RedirectResponse, StreamingResponse -import pipeline, utils, web -from modules import Search +import pipeline +import utils +import web from settings import settings logging.basicConfig( diff --git a/src/modules/__init__.py b/src/modules/__init__.py index 4afb550..2be0446 100644 --- a/src/modules/__init__.py +++ b/src/modules/__init__.py @@ -1,3 +1,12 @@ +__all__ = ['Citation', 'ContextVerdict', 'LlamaIndexRM', 'Search', 'SearchQuery', 'Statements'] + +from .citation import Citation +from .context_verdict import ContextVerdict +from .retrieve import LlamaIndexRM +from .search import Search +from .search_query import SearchQuery +from .statements import Statements + import dspy from settings import settings @@ -8,10 +17,3 @@ # LM with higher token limits llm_long = dspy.OpenAI(model=settings.LLM_MODEL_NAME, api_base=f"{settings.OPENAI_BASE_URL}/", max_tokens=500, stop='\n\n') - -from .citation import Citation -from .context_verdict import ContextVerdict -from .retrieve import LlamaIndexRM -from .search import Search -from .search_query import SearchQuery -from .statements import Statements \ No newline at end of file diff --git a/src/modules/retrieve.py b/src/modules/retrieve.py index d713aa0..4eb4bab 100644 --- a/src/modules/retrieve.py +++ b/src/modules/retrieve.py @@ -1,9 +1,8 @@ -import concurrent.futures +import dspy import logging from typing import Optional from llama_index.core import ( - Document, Settings, StorageContext, VectorStoreIndex, @@ -13,15 +12,13 @@ from llama_index.core.indices.postprocessor import SentenceTransformerRerank from llama_index.core.query_engine import RetrieverQueryEngine from llama_index.core.llms import MockLLM - -Settings.llm = MockLLM(max_tokens=256) # retrieve only, do not use LLM for synthesize +from llama_index.postprocessor.jinaai_rerank import JinaRerank import utils +from integrations import InfinityEmbedding, OllamaEmbedding from settings import settings -from llama_index.postprocessor.jinaai_rerank import JinaRerank - -from integrations import InfinityEmbedding +Settings.llm = MockLLM(max_tokens=256) # retrieve only, do not use LLM for synthesize if settings.EMBEDDING_MODEL_DEPLOY == "local": embed_model="local:" + settings.EMBEDDING_MODEL_NAME @@ -132,8 +129,6 @@ def retrieve(self, query): return contexts -import dspy - NO_TOP_K_WARNING = "The underlying LlamaIndex retriever does not support top k retrieval. Ignoring k value." class LlamaIndexRM(dspy.Retrieve): diff --git a/src/pipeline/__init__.py b/src/pipeline/__init__.py index 9d2b5aa..87d2b7a 100644 --- a/src/pipeline/__init__.py +++ b/src/pipeline/__init__.py @@ -2,6 +2,7 @@ import dspy import logging import os +from fastapi import HTTPException from fastapi.concurrency import run_in_threadpool from tenacity import retry, stop_after_attempt, wait_fixed from urllib.parse import urlparse @@ -69,12 +70,12 @@ async def _pipe_source(self, data_source, statement): # update docs _task_docs = [] for _, data_doc in data_source['docs'].items(): - if not data_doc.get('doc') and data_doc.get('valid') != False: # TODO: better way to decide if update doc + if not data_doc.get('doc') and data_doc.get('valid') is not False: # TODO: better way to decide if update doc _task_docs.append(asyncio.create_task(self.update_doc(data_doc))) await asyncio.gather(*_task_docs) # finish all docs processing # update retriever - docs = [v['doc'] for v in data_source['docs'].values() if v.get('valid') != False] + docs = [v['doc'] for v in data_source['docs'].values() if v.get('valid') is not False] if docs: data_source["retriever"] = await run_in_threadpool(LlamaIndexRM, docs=docs) @@ -129,7 +130,7 @@ async def update_doc(self, data_doc): """Update doc (URL content for now)""" try: _rep = await ReadUrl(url=data_doc['url']).get() - except: + except Exception: data_doc['valid'] = False logging.warning(f"Failed to read URL, mark as invalid: {data_doc['url']}") return @@ -176,7 +177,7 @@ def update_summary(self, data_statement): } for hostname, verdict in data_statement['sources'].items(): - if verdict.get('valid') == False: + if verdict.get('valid') is False: continue weight_total += 1 v = verdict['verdict'].lower() diff --git a/src/settings.py b/src/settings.py index b1c0cf7..b5c85a0 100644 --- a/src/settings.py +++ b/src/settings.py @@ -1,4 +1,5 @@ -import os, ast +import ast +import os class Settings: def __init__(self): @@ -28,7 +29,7 @@ def __init__(self): # set Index chunk sizes try: self.INDEX_CHUNK_SIZES = ast.literal_eval(os.environ.get("INDEX_CHUNK_SIZES")) - except: + except (ValueError, SyntaxError): self.INDEX_CHUNK_SIZES = [1024, 256] """ diff --git a/src/web/__init__.py b/src/web/__init__.py index 7050ab5..f883ef2 100644 --- a/src/web/__init__.py +++ b/src/web/__init__.py @@ -1,2 +1,4 @@ +__all__ = ['get_homepage', 'html_browser'] + from .homepage import get_homepage from .html import html_browser \ No newline at end of file diff --git a/src/web/homepage.py b/src/web/homepage.py index 40fbd09..f6e08cf 100644 --- a/src/web/homepage.py +++ b/src/web/homepage.py @@ -4,7 +4,7 @@ def get_homepage(): # get tech stack stack = utils.get_stack() - md = f"## Tech stack\n" + md = "## Tech stack\n" lines = [md] lines.extend([f"**{key}**: {value}" for key, value in stack.items()]) md = "\n\n".join(lines)