Merge pull request #19 from ittia-research/dev
move pipelines to one single class, change to streaming search backend
Showing 9 changed files with 361 additions and 221 deletions.
New file (the package `__init__.py`), 2 lines added:

```python
from .fetch import FetchUrl
from .search import SearchWeb
```
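With this `__init__.py`, both pipeline classes are re-exported at the package level. A minimal import sketch, assuming the package directory is named `pipeline` (the actual package name is not shown in this commit):

```python
# Hypothetical package name `pipeline`; only the relative module names appear in the diff
from pipeline import FetchUrl, SearchWeb
```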
New file (`fetch.py`), 27 lines added:

```python
import httpx
import json
from tenacity import retry, stop_after_attempt, wait_fixed

import utils
from settings import settings

client = httpx.AsyncClient(http2=True, follow_redirects=True)

class FetchUrl():
    """Fetch one single URL via the API fetch endpoint."""

    def __init__(self, url: str):
        self.url = url
        self.api = settings.SEARCH_BASE_URL + '/fetch'
        self.timeout = 120  # API request timeout; set high because the API backend might need a few attempts

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(0.1), before_sleep=utils.retry_log_warning, reraise=True)
    async def get(self):
        _data = {
            'url': self.url,
        }
        response = await client.post(self.api, json=_data, timeout=self.timeout)
        _r = response.json()
        if _r['status'] != 'ok':
            raise Exception(f"Fetch URL returned status not ok: {self.url}")
        return _r['data']
```
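`FetchUrl.get()` POSTs the target URL to the `/fetch` endpoint and returns the `data` field of the JSON response, retrying up to three times before re-raising. A minimal usage sketch, assuming a reachable API backend at `settings.SEARCH_BASE_URL` and the hypothetical `pipeline` package name from above:

```python
import asyncio

from pipeline import FetchUrl  # hypothetical package name

async def main():
    fetcher = FetchUrl(url="https://example.com")
    data = await fetcher.get()  # raises after 3 failed attempts (reraise=True)
    print(data)

asyncio.run(main())
```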
New file (`search.py`), 58 lines added:

```python
import asyncio
import httpx
import json
from tenacity import retry, stop_after_attempt, wait_fixed

import utils
from settings import settings

class SearchWeb():
    """
    Web search for a query, with session support:
      - get more links following the previous searches
      - get all links of this session
    """
    def __init__(self, query: str):
        self.query = query
        self.api = settings.SEARCH_BASE_URL + '/search'
        self.timeout = 600  # API request timeout; set high because the search backend might need a few attempts

        self.client = httpx.AsyncClient(http2=True, follow_redirects=True, timeout=self.timeout)
        self.urls = []  # all URLs received so far in this session

    """
    Get JSON data from the API stream output.
    TODO:
      - Is there a more standard way to process streamed JSON?
    """
    @retry(stop=stop_after_attempt(3), wait=wait_fixed(0.1), before_sleep=utils.retry_log_warning, reraise=True)
    async def get(self, num: int = 10, all: bool = False):
        _data = {
            'query': self.query,
            'num': num,  # how many more URLs to get
            'all': all,
        }
        async with self.client.stream("POST", self.api, json=_data) as response:
            buffer = ""
            async for chunk in response.aiter_text():
                if chunk.strip():  # only process non-empty chunks
                    buffer += chunk

                    # Attempt to decode the buffer as JSON
                    try:
                        # Keep decoding JSON objects until all buffered data is consumed
                        while buffer:
                            # Try to decode one complete JSON object
                            rep, index = json.JSONDecoder().raw_decode(buffer)
                            _url = rep['url']
                            # Deduplication
                            if _url not in self.urls:  # TODO: what if the new one contains the same URL but better metadata
                                self.urls.append(_url)
                                yield rep

                            # Remove the processed JSON object and any leading whitespace from the buffer
                            buffer = buffer[index:].lstrip()
                    except json.JSONDecodeError:
                        # We may not have a complete JSON object yet; continue reading more data
                        continue
```
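Because `get()` is an async generator that decodes concatenated JSON objects off the response stream, callers consume it with `async for`; repeated calls on the same instance are deduplicated against `self.urls`. A minimal usage sketch, again assuming the hypothetical `pipeline` package and a backend that streams one JSON object per result, each containing at least a `url` key as the parser expects:

```python
import asyncio

from pipeline import SearchWeb  # hypothetical package name

async def main():
    search = SearchWeb(query="climate change latest research")

    # First batch: results are yielded one by one as the backend streams them
    async for rep in search.get(num=10):
        print(rep['url'])

    # Ask for more links in the same session; URLs already seen are skipped
    async for rep in search.get(num=10):
        print(rep['url'])

    print(len(search.urls))  # total unique URLs collected this session

asyncio.run(main())
```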