This repository has been archived by the owner on Feb 12, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5c25ae5
commit ae582f2
Showing
12 changed files
with
249 additions
and
112 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from elasticsearch import Elasticsearch | ||
|
||
# Initialize Elasticsearch client
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# Schema of one indexed paragraph. "embedding" holds the sentence vector that
# is compared against the query vector at search time.
mapping = {
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "url": {"type": "keyword"},
            "paragraph_id": {"type": "integer"},
            "content": {"type": "text"},
            # Adjust the dims based on the model output dimension
            "embedding": {"type": "dense_vector", "dims": 384},
        }
    }
}

# Create the index only when it is missing. An explicit existence check is
# used instead of ignore=400 because HTTP 400 is also returned for genuine
# problems (for example an invalid mapping), which must not be swallowed.
if not es.indices.exists(index="documentation_files"):
    es.indices.create(index="documentation_files", body=mapping)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import requests | ||
from bs4 import BeautifulSoup | ||
from elasticsearch import Elasticsearch | ||
from sentence_transformers import SentenceTransformer | ||
|
||
# Elasticsearch client pointed at localhost:9200
es = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])

# Sentence-embedding model used to vectorize each paragraph
model = SentenceTransformer("sentence-transformers/msmarco-MiniLM-L-12-v3")
|
||
# Define a function to fetch and parse webpages
def fetch_and_parse(url, timeout=10):
    """Download *url* and return one document per non-empty ``<p>`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``url``, ``title``, ``paragraph_id``
        (position of the ``<p>`` element on the page), ``content`` and
        ``embedding`` (list of floats produced by the module-level ``model``).
        The list is empty when the page has no non-empty paragraphs.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled server from hanging the script, and
    # raise_for_status() keeps HTTP error pages out of the index.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.title.string can be None (e.g. an empty <title>); fall back to the
    # placeholder so "title" is always a usable string for consumers.
    title = (soup.title.string if soup.title else None) or "No title found"

    # Keep the original <p> position as paragraph_id, but drop paragraphs that
    # are empty after stripping: they carry nothing to search or answer from.
    numbered = [
        (i, p.get_text().strip())
        for i, p in enumerate(soup.find_all('p'))
    ]
    numbered = [(i, text) for i, text in numbered if text]
    if not numbered:
        return []

    # Encode every paragraph in a single batched call instead of one model
    # invocation per paragraph.
    embeddings = model.encode(
        [text for _, text in numbered],
        convert_to_tensor=True,
        show_progress_bar=False,
    ).tolist()

    return [
        {
            'url': url,
            'title': title,
            'paragraph_id': i,
            'content': text,
            'embedding': embedding,
        }
        for (i, text), embedding in zip(numbered, embeddings)
    ]
|
||
# List of URLs to index
urls = [
    "https://en.wikipedia.org/wiki/Web_scraping",
    "https://en.wikipedia.org/wiki/Elasticsearch",
    "https://en.wikipedia.org/wiki/Python_(programming_language)",
]

# Index the webpages in Elasticsearch
for url in urls:
    for page in fetch_and_parse(url):
        # Index the paragraph as a separate document in Elasticsearch.
        # A deterministic id (page URL + paragraph position) makes re-running
        # the script overwrite existing documents instead of duplicating them.
        # The deprecated doc_type argument is omitted; the client falls back
        # to "_doc" on its own.
        es.index(
            index='documentation_files',
            id=f"{url}#{page['paragraph_id']}",
            body=page,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Use the official Python image as the base image | ||
FROM python:3.9 | ||
|
||
# Set the working directory | ||
WORKDIR /app | ||
|
||
# Copy the requirements.txt file into the container | ||
COPY requirements.txt . | ||
|
||
# Install the dependencies | ||
RUN pip install --no-cache-dir -r requirements.txt | ||
|
||
# Copy the rest of the application files into the container | ||
COPY . . | ||
|
||
# Expose the port the app runs on | ||
EXPOSE 5000 | ||
|
||
# Run the Flask application | ||
CMD ["flask", "run", "--host", "0.0.0.0"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
from elasticsearch import Elasticsearch, RequestsHttpConnection | ||
from requests.adapters import HTTPAdapter | ||
from requests.packages.urllib3.util.retry import Retry | ||
|
||
from flask import Flask, render_template, request | ||
from transformers import pipeline | ||
from sentence_transformers import SentenceTransformer | ||
|
||
app = Flask(__name__)

# Extractive question-answering model applied to the retrieved passage.
model_name = "deepset/roberta-base-squad2"

nlp = pipeline('question-answering', model=model_name,
               tokenizer=model_name, padding=True, truncation=True)

# Load the sentence transformer model used to embed the incoming question
model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L-12-v3')

# Define a retry strategy for the Elasticsearch client
retry_strategy = Retry(
    total=3,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "PUT", "DELETE", "OPTIONS", "POST"],
    backoff_factor=1
)

adapter = HTTPAdapter(max_retries=retry_strategy)

# Initialize the Elasticsearch client with the retry strategy
es = Elasticsearch(
    [{'host': 'elasticsearch', 'port': 9200}],
    connection_class=RequestsHttpConnection,
    max_retries=3
)
# Setting an "adapter" attribute on the connection pool has no effect: nothing
# reads it. The adapter only takes part in requests once it is mounted on the
# requests.Session owned by each RequestsHttpConnection.
for connection in es.transport.connection_pool.connections:
    connection.session.mount("http://", adapter)
    connection.session.mount("https://", adapter)
|
||
|
||
|
||
@app.route("/", methods=["GET", "POST"])
def index():
    """Render the search form and, on POST, answer the submitted question.

    On POST the question is embedded with ``model``, the closest paragraph is
    retrieved from the ``documentation_files`` index by cosine similarity, and
    the question-answering pipeline ``nlp`` extracts an answer from it.

    Returns:
        The rendered ``index.html`` template. ``answers`` is a list of
        ``[answer, passage]`` pairs, or ``None`` when no question was asked.
    """
    if request.method != "POST":
        return render_template("index.html", question=None, answers=None)

    # The "required" attribute on the form is only enforced by the browser;
    # a blank submission is treated like a plain page load.
    question = request.form["question"].strip()
    if not question:
        return render_template("index.html", question=None, answers=None)

    # Compute the question embedding
    question_embedding = model.encode(
        question, convert_to_tensor=True, show_progress_bar=False).tolist()

    index_name = 'documentation_files'

    query = {
        # The stored vectors are only needed for scoring inside
        # Elasticsearch; leave them out of the response payload.
        "_source": {"excludes": ["embedding"]},
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    # + 1.0 shifts the cosine similarity so the score is
                    # never negative.
                    "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                    "params": {"query_vector": question_embedding}
                }
            }
        },
        "size": 1
    }
    response = es.search(index=index_name, body=query)
    passages = [hit['_source'] for hit in response['hits']['hits']]

    answers = []
    for passage in passages:
        # A document may have been indexed with a null title; fall back to an
        # empty string so the concatenation cannot raise TypeError.
        title = passage.get('title') or ''
        QA_input = {
            'question': question,
            'context': title + ' ' + passage['content']
        }
        answer = nlp(QA_input)['answer']

        answers.append([answer, passage])

    return render_template("index.html", question=question, answers=answers)
|
||
|
||
if __name__ == "__main__":
    import os

    # Debug mode turns on the interactive debugger and the reloader, which
    # must not be exposed outside local development. It stays enabled by
    # default for direct local runs and can be switched off with FLASK_DEBUG=0.
    app.run(debug=os.environ.get("FLASK_DEBUG", "1") != "0")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
Flask==2.1.1 | ||
transformers==4.12.2 | ||
elasticsearch==7.15.2 | ||
torch | ||
sentence-transformers |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
<!DOCTYPE html> | ||
<html lang="en"> | ||
<head> | ||
<meta charset="utf-8"> | ||
<meta name="viewport" content="width=device-width, initial-scale=1"> | ||
<title>Question-Answering Search</title> | ||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5/dist/css/bootstrap.min.css" rel="stylesheet"> | ||
</head> | ||
<body> | ||
<div class="container mt-5"> | ||
<h1 class="mb-4">Question-Answering Search</h1> | ||
<form action="/" method="post"> | ||
<div class="mb-3"> | ||
<label for="question" class="form-label">Enter your question:</label> | ||
<input type="text" class="form-control" id="question" name="question" required> | ||
</div> | ||
<button type="submit" class="btn btn-primary">Search</button> | ||
</form> | ||
{% if question and answers %} | ||
<h2 class="mt-5">Question: {{ question }}</h2> | ||
<p> The answer is <strong>{{ answers[0][0] }} </strong></p> | ||
<br /> | ||
<p>Sources:</p> | ||
<ol> | ||
{% for answer in answers %} | ||
<li> | ||
<div> | ||
<h3> <a href="{{answer[1]['url']}}"> {{ answer[1]['title']}} </a> </h3> | ||
<p>{{answer[1]['content']}}</p> | ||
<hr> | ||
</div> | ||
</li> | ||
{% endfor %} | ||
</ol> | ||
{% endif %} | ||
</div> | ||
</body> | ||
</html> |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,6 @@ | ||
requests==2.26.0 | ||
elasticsearch==7.16.0 | ||
transformers | ||
torch | ||
torch | ||
beautifulsoup4 | ||
sentence-transformers |
Oops, something went wrong.