Add db query to retrieve page content for summarize #51

Merged: 1 commit, Jul 22, 2024
engine/main.py (1 addition, 1 deletion)

@@ -151,7 +151,7 @@ def main():
         asyncio.run(pipeline(online=False))
     elif args.server:
         # Start the server
-        start_server(debug=args.debug)
+        start_server(debug=args.debug, con=con)
     elif args.file:
         # Rank the queries from the file
         queries = rank_from_file(args.file)
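For context on where `con` comes from: `main()` evidently opens the DuckDB database before dispatching on the CLI flags, so the same live connection can be handed to the server. A minimal sketch of that wiring, with the argument parsing, the import, and the database path assumed rather than taken from this diff:

```python
# Hypothetical reconstruction of the surrounding main() wiring -- only the
# start_server(debug=..., con=con) call is actually part of this PR.
import argparse

import duckdb

from server import start_server  # assumed import; server.py sits next to main.py


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--server", action="store_true")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    con = duckdb.connect("crawlies.db")  # path assumed from server.py's __main__ block
    if args.server:
        # Hand the open connection to the server so request handlers can query it
        start_server(debug=args.debug, con=con)
    con.close()
```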
engine/server.py (33 additions, 9 deletions)

@@ -1,8 +1,11 @@
+import lzma
+import pickle
+
+import duckdb
 import flask
 from flask import Flask, jsonify, request, Response
 from flask_cors import CORS, cross_origin
 
-from custom_db import get_page_by_id
 from preview import load_preview
 from rank import rank
 from summarize import get_summary_model
@@ -15,9 +18,13 @@
 app = Flask(__name__)
 CORS(app, resources={r"/*": {"origins": "*"}})
 
+dbcon: duckdb.DuckDBPyConnection = None
+
 
-def start_server(debug=False):
+def start_server(debug=False, con: duckdb.DuckDBPyConnection = None):
     print("Starting server...")
+    global dbcon
+    dbcon = con
     app.run(port=PORT, debug=debug, use_reloader=debug)
 
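Because Flask view functions only receive request data, `start_server` stashes the connection in the module-level `dbcon` global where handlers can reach it. A single DuckDB connection is not meant to be shared across server threads, which is presumably why the `summarize` handler below takes its own cursor via `dbcon.cursor()`: in DuckDB's Python API that call duplicates the connection, and closing the duplicate leaves the original open. A standalone sketch of the pattern (function name assumed; table and column names taken from the query in this PR):

```python
# Per-request cursor pattern, outside Flask for clarity.
import duckdb

dbcon = duckdb.connect("crawlies.db")


def lookup_link(doc_id: str) -> str:
    cur = dbcon.cursor()  # independent duplicate of the shared connection
    try:
        row = cur.execute(
            "SELECT link FROM documents WHERE id = ?", [doc_id]
        ).fetchone()
        return row[0] if row else ""
    finally:
        cur.close()  # closes only the duplicate, not dbcon
```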
@@ -80,14 +87,28 @@ def summarize(doc_id):
         "summary": ""
     }
 
-    # Get the document by ID
-    doc = get_page_by_id(doc_id)
-    if doc.empty:
-        return Response("Document not found", status=404)
+    con = dbcon.cursor()
+    blob = con.execute("""
+        SELECT c.content
+        FROM documents AS d, crawled AS c
+        WHERE d.id = ?
+          AND d.link = c.link
+    """, [doc_id]).fetchall()[0][0]
+    con.close()
+
+    soup = pickle.loads(lzma.decompress(blob))
+    main_content = soup.find("main") or soup.find("article") \
+        or soup.find("section") or soup.find("body")
+
+    if main_content is None:
+        print(f"Warning: No main content found for {doc_id}. Using entire body.")
+        main_content = soup
 
     # Get the text from the document
-    text = doc['text'].values[0]
+    text = main_content.get_text()
 
     # Summarize the text
     summarized_text = get_summary_model().summarize(text)
 
     result["summary"] = summarized_text
 
     return jsonify(result)
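The new query implies the crawler stores each page in the `crawled` table as an lzma-compressed pickle of the parsed BeautifulSoup tree, keyed by `link` and joined against `documents` on that column. A round-trip sketch of that storage format; the write side is inferred from `pickle.loads(lzma.decompress(blob))` above and is not shown in this PR:

```python
import lzma
import pickle

from bs4 import BeautifulSoup

html = "<html><body><main><p>Hello crawlies</p></main></body></html>"
soup = BeautifulSoup(html, "html.parser")

# Presumed crawler side: pickle the parsed tree, then lzma-compress it.
blob = lzma.compress(pickle.dumps(soup))

# Server side, as in summarize(): decompress, unpickle, prefer the most
# specific content container, and fall back to the whole tree.
restored = pickle.loads(lzma.decompress(blob))
main_content = (restored.find("main") or restored.find("article")
                or restored.find("section") or restored.find("body"))
text = (main_content or restored).get_text()
print(text)  # -> Hello crawlies
```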
@@ -104,4 +125,7 @@ def site_map():
 
 
 if __name__ == "__main__":
-    start_server()
+    dbcon = duckdb.connect("crawlies.db")
+    start_server(con=dbcon)
+    dbcon.close()
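Assuming the handler is routed as `/summarize/<doc_id>` (the decorator sits outside this diff) and the server's `PORT` constant keeps Flask's default, a quick smoke test of the new path once the server is up:

```python
# Both the route and the port are assumptions; adjust to the actual
# @app.route decorator and PORT constant in server.py.
import requests

PORT = 5000  # assumed value of server.py's PORT constant

resp = requests.get(f"http://localhost:{PORT}/summarize/42")  # 42: any crawled doc id
resp.raise_for_status()
print(resp.json()["summary"])
```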