From ee3a450de5a92d031ecf62cddda927edd88163d2 Mon Sep 17 00:00:00 2001
From: etwk <48991073+etwk@users.noreply.github.com>
Date: Tue, 3 Sep 2024 01:47:42 +0000
Subject: [PATCH 1/2] debug: import RedirectResponse in src/main.py

---
 src/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main.py b/src/main.py
index 1e1c76d..af68610 100644
--- a/src/main.py
+++ b/src/main.py
@@ -3,7 +3,7 @@
 import logging
 from fastapi import FastAPI, HTTPException, Request, Header
 from fastapi.concurrency import run_in_threadpool
-from fastapi.responses import Response, JSONResponse, HTMLResponse, PlainTextResponse, FileResponse, StreamingResponse
+from fastapi.responses import Response, JSONResponse, HTMLResponse, PlainTextResponse, FileResponse, RedirectResponse, StreamingResponse
 
 import pipeline, utils, web
 from modules import Search

From 9110b06bb962643fb705f9bf9db118f37c42cb08 Mon Sep 17 00:00:00 2001
From: etwk <48991073+etwk@users.noreply.github.com>
Date: Tue, 3 Sep 2024 14:19:16 +0000
Subject: [PATCH 2/2] lint: tidy imports, add __all__, replace bare excepts and ==/!= False checks

---
 datasets/wiki_dpr/prepare_files.py |  5 ++---
 src/api/__init__.py                |  2 ++
 src/api/read.py                    |  1 -
 src/api/search.py                  |  1 -
 src/integrations/__init__.py       |  2 ++
 src/main.py                        | 11 +++++------
 src/modules/__init__.py            | 16 +++++++++-------
 src/modules/retrieve.py            | 13 ++++---------
 src/pipeline/__init__.py           |  9 +++++----
 src/settings.py                    |  5 +++--
 src/web/__init__.py                |  2 ++
 src/web/homepage.py                |  2 +-
 12 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/datasets/wiki_dpr/prepare_files.py b/datasets/wiki_dpr/prepare_files.py
index 9782536..38ad959 100644
--- a/datasets/wiki_dpr/prepare_files.py
+++ b/datasets/wiki_dpr/prepare_files.py
@@ -1,4 +1,5 @@
-import os, subprocess
+import os
+import subprocess
 import shutil
 from huggingface_hub import snapshot_download
 from tenacity import retry, stop_after_attempt, wait_fixed
@@ -23,8 +24,6 @@
 ]
 revision = "main"
 
-import os
-
 def check_exists(folder_path):
     # Check if the folder exists
     if os.path.exists(folder_path) and os.path.isdir(folder_path):
diff --git a/src/api/__init__.py b/src/api/__init__.py
index 4cfb7fd..d8face1 100644
--- a/src/api/__init__.py
+++ b/src/api/__init__.py
@@ -1,2 +1,4 @@
+__all__ = ['ReadUrl', 'SearchWeb']
+
 from .read import ReadUrl
 from .search import SearchWeb
diff --git a/src/api/read.py b/src/api/read.py
index 4151fd1..38614fc 100644
--- a/src/api/read.py
+++ b/src/api/read.py
@@ -1,5 +1,4 @@
 import httpx
-import json
 from tenacity import retry, stop_after_attempt, wait_fixed
 
 import utils
diff --git a/src/api/search.py b/src/api/search.py
index 5bef64f..a9ef90a 100644
--- a/src/api/search.py
+++ b/src/api/search.py
@@ -1,4 +1,3 @@
-import asyncio
 import httpx
 import json
 from tenacity import retry, stop_after_attempt, wait_fixed
diff --git a/src/integrations/__init__.py b/src/integrations/__init__.py
index 2276b56..8452465 100644
--- a/src/integrations/__init__.py
+++ b/src/integrations/__init__.py
@@ -1,2 +1,4 @@
+__all__ = ['InfinityEmbedding', 'OllamaEmbedding']
+
 from .infinity_embedding import InfinityEmbedding
 from .ollama_embedding import OllamaEmbedding
\ No newline at end of file
diff --git a/src/main.py b/src/main.py
index af68610..cd2ab4f 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,12 +1,11 @@
 import asyncio
-import json
 import logging
-from fastapi import FastAPI, HTTPException, Request, Header
-from fastapi.concurrency import run_in_threadpool
-from fastapi.responses import Response, JSONResponse, HTMLResponse, PlainTextResponse, FileResponse, RedirectResponse, StreamingResponse
+from fastapi import FastAPI, HTTPException, Header
+from fastapi.responses import HTMLResponse, PlainTextResponse, RedirectResponse, StreamingResponse
 
-import pipeline, utils, web
-from modules import Search
+import pipeline
+import utils
+import web
 from settings import settings
 
 logging.basicConfig(
diff --git a/src/modules/__init__.py b/src/modules/__init__.py
index 4afb550..2be0446 100644
--- a/src/modules/__init__.py
+++ b/src/modules/__init__.py
@@ -1,3 +1,12 @@
+__all__ = ['Citation', 'ContextVerdict', 'LlamaIndexRM', 'Search', 'SearchQuery', 'Statements']
+
+from .citation import Citation
+from .context_verdict import ContextVerdict
+from .retrieve import LlamaIndexRM
+from .search import Search
+from .search_query import SearchQuery
+from .statements import Statements
+
 import dspy
 
 from settings import settings
@@ -8,10 +17,3 @@
 
 # LM with higher token limits
 llm_long = dspy.OpenAI(model=settings.LLM_MODEL_NAME, api_base=f"{settings.OPENAI_BASE_URL}/", max_tokens=500, stop='\n\n')
-
-from .citation import Citation
-from .context_verdict import ContextVerdict
-from .retrieve import LlamaIndexRM
-from .search import Search
-from .search_query import SearchQuery
-from .statements import Statements
\ No newline at end of file
diff --git a/src/modules/retrieve.py b/src/modules/retrieve.py
index d713aa0..4eb4bab 100644
--- a/src/modules/retrieve.py
+++ b/src/modules/retrieve.py
@@ -1,9 +1,8 @@
-import concurrent.futures
+import dspy
 import logging
 from typing import Optional
 
 from llama_index.core import (
-    Document,
     Settings,
     StorageContext,
     VectorStoreIndex,
@@ -13,15 +12,13 @@
 from llama_index.core.indices.postprocessor import SentenceTransformerRerank
 from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.core.llms import MockLLM
-
-Settings.llm = MockLLM(max_tokens=256)  # retrieve only, do not use LLM for synthesize
+from llama_index.postprocessor.jinaai_rerank import JinaRerank
 
 import utils
+from integrations import InfinityEmbedding, OllamaEmbedding
 from settings import settings
 
-from llama_index.postprocessor.jinaai_rerank import JinaRerank
-
-from integrations import InfinityEmbedding
+Settings.llm = MockLLM(max_tokens=256)  # retrieve only, do not use LLM for synthesize
 
 if settings.EMBEDDING_MODEL_DEPLOY == "local":
     embed_model="local:" + settings.EMBEDDING_MODEL_NAME
@@ -132,8 +129,6 @@ def retrieve(self, query):
             
         return contexts
 
-import dspy
-
 NO_TOP_K_WARNING = "The underlying LlamaIndex retriever does not support top k retrieval. Ignoring k value."
 
 class LlamaIndexRM(dspy.Retrieve):
diff --git a/src/pipeline/__init__.py b/src/pipeline/__init__.py
index 9d2b5aa..87d2b7a 100644
--- a/src/pipeline/__init__.py
+++ b/src/pipeline/__init__.py
@@ -2,6 +2,7 @@
 import dspy
 import logging
 import os
+from fastapi import HTTPException
 from fastapi.concurrency import run_in_threadpool
 from tenacity import retry, stop_after_attempt, wait_fixed
 from urllib.parse import urlparse
@@ -69,12 +70,12 @@ async def _pipe_source(self, data_source, statement):
         # update docs
         _task_docs = []
         for _, data_doc in data_source['docs'].items():
-            if not data_doc.get('doc') and data_doc.get('valid') != False:  # TODO: better way to decide if update doc
+            if not data_doc.get('doc') and data_doc.get('valid') is not False:  # TODO: better way to decide if update doc
                 _task_docs.append(asyncio.create_task(self.update_doc(data_doc)))
         await asyncio.gather(*_task_docs)  # finish all docs processing
 
         # update retriever
-        docs = [v['doc'] for v in data_source['docs'].values() if v.get('valid') != False]
+        docs = [v['doc'] for v in data_source['docs'].values() if v.get('valid') is not False]
         if docs:
             data_source["retriever"] = await run_in_threadpool(LlamaIndexRM, docs=docs)
             
@@ -129,7 +130,7 @@ async def update_doc(self, data_doc):
         """Update doc (URL content for now)"""
         try:
             _rep = await ReadUrl(url=data_doc['url']).get()
-        except:
+        except Exception:
             data_doc['valid'] = False
             logging.warning(f"Failed to read URL, mark as invalid: {data_doc['url']}")
             return
@@ -176,7 +177,7 @@ def update_summary(self, data_statement):
         }
     
         for hostname, verdict in data_statement['sources'].items():
-            if verdict.get('valid') == False:
+            if verdict.get('valid') is False:
                 continue
             weight_total += 1
             v = verdict['verdict'].lower()
diff --git a/src/settings.py b/src/settings.py
index b1c0cf7..b5c85a0 100644
--- a/src/settings.py
+++ b/src/settings.py
@@ -1,4 +1,5 @@
-import os, ast
+import ast
+import os
 
 class Settings:
     def __init__(self):
@@ -28,7 +29,7 @@ def __init__(self):
         # set Index chunk sizes
         try:
             self.INDEX_CHUNK_SIZES = ast.literal_eval(os.environ.get("INDEX_CHUNK_SIZES"))
-        except:
+        except (ValueError, SyntaxError):
             self.INDEX_CHUNK_SIZES = [1024, 256]
 
         """
diff --git a/src/web/__init__.py b/src/web/__init__.py
index 7050ab5..f883ef2 100644
--- a/src/web/__init__.py
+++ b/src/web/__init__.py
@@ -1,2 +1,4 @@
+__all__ = ['get_homepage', 'html_browser']
+
 from .homepage import get_homepage
 from .html import html_browser
\ No newline at end of file
diff --git a/src/web/homepage.py b/src/web/homepage.py
index 40fbd09..f6e08cf 100644
--- a/src/web/homepage.py
+++ b/src/web/homepage.py
@@ -4,7 +4,7 @@
 def get_homepage():
     # get tech stack
     stack = utils.get_stack()
-    md = f"## Tech stack\n"
+    md = "## Tech stack\n"
     lines = [md]
     lines.extend([f"**{key}**: {value}" for key, value in stack.items()])
     md = "\n\n".join(lines)