From ae582f2d6112302ad41273750b41cb8eee7fb879 Mon Sep 17 00:00:00 2001
From: Achilleas Moraitis
Date: Fri, 17 Mar 2023 14:58:21 +0100
Subject: [PATCH] feat: qa with semantic search

---
 README.md                      | 25 ++++++++++-
 data-import/create_index.py    | 19 ++++++++
 data-import/webpages.py        | 50 +++++++++++++++++++++
 docker-compose.yml             | 10 ++++-
 document-imports/markdown.py   | 24 ----------
 flask_app/Dockerfile           | 20 +++++++++
 flask_app/app.py               | 82 ++++++++++++++++++++++++++++++++++
 flask_app/requirements.txt     |  5 +++
 flask_app/templates/index.html | 38 ++++++++++++++++
 ml-search.py                   | 50 ---------------------
 requirements.txt               |  4 +-
 search.py                      | 34 --------------
 12 files changed, 249 insertions(+), 112 deletions(-)
 create mode 100644 data-import/create_index.py
 create mode 100644 data-import/webpages.py
 delete mode 100644 document-imports/markdown.py
 create mode 100644 flask_app/Dockerfile
 create mode 100644 flask_app/app.py
 create mode 100644 flask_app/requirements.txt
 create mode 100644 flask_app/templates/index.html
 delete mode 100644 ml-search.py
 delete mode 100644 search.py

diff --git a/README.md b/README.md
index 5acd313..7663ca9 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,13 @@
 
 Documentation never has been so fun 😎
 
+This is a question-answering system that uses semantic search and a QA ML model
+to answer the user's question.
+
+The system returns both the answer and its source.
+
+https://user-images.githubusercontent.com/4193340/226748890-2415ca7e-e13a-4e5e-9f82-c56a1a6408cc.mp4
+
 ## Setup
 
 ```bash
@@ -19,8 +26,22 @@ pip install -r requirements.txt
 ```
 
 ```bash
-docker-compose up -d
+docker-compose up -d --build
 ```
 
-Kibana link http://localhost:5601
\ No newline at end of file
+Kibana link http://localhost:5601
+The Flask application should be accessible at http://localhost:5001.
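+
+Once the containers are up, you can sanity-check the question endpoint from the
+command line (a minimal sketch; the `question` form field matches what
+`flask_app/app.py` reads from the request):
+
+```bash
+curl -s -X POST -d "question=What is Elasticsearch?" http://localhost:5001/
+```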
+
+## Elasticsearch
+
+The first time, you need to create the index that stores the documents:
+
+```bash
+python data-import/create_index.py
+```
+
+To import some sample pages, run:
+
+```bash
+python data-import/webpages.py
+```
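+
+To verify that the paragraphs were indexed, you can query Elasticsearch directly
+(a quick check; `documentation_files` is the index name used by
+`data-import/create_index.py`):
+
+```bash
+curl -s "http://localhost:9200/documentation_files/_search?size=1&pretty"
+```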
diff --git a/data-import/create_index.py b/data-import/create_index.py
new file mode 100644
index 0000000..bdd4c5c
--- /dev/null
+++ b/data-import/create_index.py
@@ -0,0 +1,19 @@
+from elasticsearch import Elasticsearch
+
+# Initialize Elasticsearch client
+es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
+
+# Create an index with the appropriate mapping
+mapping = {
+    "mappings": {
+        "properties": {
+            "title": {"type": "text"},
+            "url": {"type": "keyword"},
+            "paragraph_id": {"type": "integer"},
+            "content": {"type": "text"},
+            "embedding": {"type": "dense_vector", "dims": 384}  # Adjust the dims based on the model output dimension
+        }
+    }
+}
+
+es.indices.create(index="documentation_files", body=mapping, ignore=400)
\ No newline at end of file
diff --git a/data-import/webpages.py b/data-import/webpages.py
new file mode 100644
index 0000000..dbd625d
--- /dev/null
+++ b/data-import/webpages.py
@@ -0,0 +1,50 @@
+import requests
+from bs4 import BeautifulSoup
+from elasticsearch import Elasticsearch
+from sentence_transformers import SentenceTransformer
+
+# Initialize Elasticsearch client
+es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
+
+# Load the sentence transformer model
+model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L-12-v3')
+
+# Define a function to fetch and parse webpages
+def fetch_and_parse(url):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # Extract webpage title
+    title = soup.title.string if soup.title else "No title found"
+
+    # Extract and store paragraphs as separate documents
+    paragraphs = []
+    for i, p in enumerate(soup.find_all('p')):
+        content = p.get_text().strip()
+        # Compute the paragraph embedding
+        embedding = model.encode(content, convert_to_tensor=True, show_progress_bar=False).tolist()
+
+        paragraphs.append({
+            'url': url,
+            'title': title,
+            'paragraph_id': i,
+            'content': content,
+            'embedding': embedding
+        })
+
+    return paragraphs
+
+# List of URLs to index
+urls = [
+    "https://en.wikipedia.org/wiki/Web_scraping",
+    "https://en.wikipedia.org/wiki/Elasticsearch",
+    "https://en.wikipedia.org/wiki/Python_(programming_language)"
+]
+
+# Index the webpages in Elasticsearch
+for url in urls:
+    parsed_pages = fetch_and_parse(url)
+
+    for page in parsed_pages:
+        # Index the paragraph as a separate document in Elasticsearch
+        es.index(index='documentation_files', doc_type='_doc', body=page)
diff --git a/docker-compose.yml b/docker-compose.yml
index 01f5b69..506fab1 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -26,6 +26,14 @@ services:
     ports:
       - 5601:5601
 
+  flask_app:
+    build: ./flask_app
+    container_name: flask_app
+    depends_on:
+      - elasticsearch
+    ports:
+      - "5001:5000"
+
 volumes:
   esdata:
-    driver: local
\ No newline at end of file
+    driver: local
diff --git a/document-imports/markdown.py b/document-imports/markdown.py
deleted file mode 100644
index aed7afd..0000000
--- a/document-imports/markdown.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import os
-from elasticsearch import Elasticsearch
-
-# Load the Markdown file
-markdown_file = 'README.md'
-with open(markdown_file, 'r', encoding='utf-8') as f:
-    markdown_content = f.read()
-
-# Extract the title (assuming the first heading is the title)
-title_lines = [line for line in markdown_content.splitlines() if line.startswith(('#', '##', '###', '####', '#####', '######'))]
-title = title_lines[0] if title_lines else 'Untitled'
-
-# Preprocess the data: create a document for Elasticsearch
-document = {
-    'file': os.path.abspath(markdown_file),
-    'title': title,
-    'content': markdown_content,
-}
-print(document)
-# Index the document in Elasticsearch
-es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
-index_name = 'markdown_files'
-doc_type = '_doc'
-es.index(index=index_name, doc_type=doc_type, body=document)
diff --git a/flask_app/Dockerfile b/flask_app/Dockerfile
new file mode 100644
index 0000000..1bf3278
--- /dev/null
+++ b/flask_app/Dockerfile
@@ -0,0 +1,20 @@
+# Use the official Python image as the base image
+FROM python:3.9
+
+# Set the working directory
+WORKDIR /app
+
+# Copy the requirements.txt file into the container
+COPY requirements.txt .
+
+# Install the dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application files into the container
+COPY . .
+
+# Expose the port the app runs on
+EXPOSE 5000
+
+# Run the Flask application
+CMD ["flask", "run", "--host", "0.0.0.0"]
diff --git a/flask_app/app.py b/flask_app/app.py
new file mode 100644
index 0000000..32da89a
--- /dev/null
+++ b/flask_app/app.py
@@ -0,0 +1,82 @@
+from elasticsearch import Elasticsearch, RequestsHttpConnection
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+
+from flask import Flask, render_template, request
+from transformers import pipeline
+from sentence_transformers import SentenceTransformer
+
+app = Flask(__name__)
+
+
+model_name = "deepset/roberta-base-squad2"
+
+nlp = pipeline('question-answering', model=model_name,
+               tokenizer=model_name, padding=True, truncation=True)
+
+# Load the sentence transformer model
+model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L-12-v3')
+
+# Define a retry strategy for the Elasticsearch client
+retry_strategy = Retry(
+    total=3,
+    status_forcelist=[429, 500, 502, 503, 504],
+    allowed_methods=["HEAD", "GET", "PUT", "DELETE", "OPTIONS", "POST"],
+    backoff_factor=1
+)
+
+adapter = HTTPAdapter(max_retries=retry_strategy)
+
+# Initialize the Elasticsearch client with the retry strategy
+es = Elasticsearch(
+    [{'host': 'elasticsearch', 'port': 9200}],
+    connection_class=RequestsHttpConnection,
+    max_retries=3
+)
+es.transport.connection_pool.adapter = adapter
+
+
+@app.route("/", methods=["GET", "POST"])
+def index():
+    if request.method == "POST":
+        question = request.form["question"]
+
+        # Compute the question embedding
+        question_embedding = model.encode(
+            question, convert_to_tensor=True, show_progress_bar=False).tolist()
+
+        index_name = 'documentation_files'
+
+        query = {
+            "query": {
+                "script_score": {
+                    "query": {"match_all": {}},
+                    "script": {
+                        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
+                        "params": {"query_vector": question_embedding}
+                    }
+                }
+            },
+            "size": 1
+        }
+        response = es.search(index=index_name, body=query)
+        passages = [hit['_source'] for hit in response['hits']['hits']]
+
+        answers = []
+        for passage in passages:
+            QA_input = {
+                'question': question,
+                'context': passage['title'] + ' ' + passage['content']
+            }
+            answer = nlp(QA_input)['answer']
+
+            answers.append([answer, passage])
+
+        return render_template("index.html", question=question, answers=answers)
+    else:
+        return render_template("index.html", question=None, answers=None)
+
+
+if __name__ == "__main__":
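+    # Debug mode is only for running app.py directly; the Docker image starts
+    # the app with "flask run" instead (see flask_app/Dockerfile).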
+    app.run(debug=True)
diff --git a/flask_app/requirements.txt b/flask_app/requirements.txt
new file mode 100644
index 0000000..5b6bbb8
--- /dev/null
+++ b/flask_app/requirements.txt
@@ -0,0 +1,5 @@
+Flask==2.1.1
+transformers==4.12.2
+elasticsearch==7.15.2
+torch
+sentence-transformers
\ No newline at end of file
diff --git a/flask_app/templates/index.html b/flask_app/templates/index.html
new file mode 100644
index 0000000..d8d768d
--- /dev/null
+++ b/flask_app/templates/index.html
@@ -0,0 +1,38 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Question-Answering Search</title>
+</head>
+<body>
+    <h1>Question-Answering Search</h1>
+
+    <form method="POST">
+        <input type="text" name="question" placeholder="Ask a question..." required>
+        <button type="submit">Search</button>
+    </form>
+
+    {% if question and answers %}
+    <h2>Question: {{ question }}</h2>
+
+    <p>The answer is <strong>{{ answers[0][0] }}</strong></p>
+
+    <h3>Sources:</h3>
+    <ol>
+        {% for answer in answers %}
+        <li>
+            <h4><a href="{{ answer[1]['url'] }}" target="_blank">{{ answer[1]['title'] }}</a></h4>
+            <p>{{ answer[1]['content'] }}</p>
+        </li>
+        {% endfor %}
+    </ol>
+    {% endif %}
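+    <!-- "answers" is a list of [answer, passage] pairs built in flask_app/app.py;
+         answers[0][0] is the top extracted answer shown above. -->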
+</body>
+</html>
\ No newline at end of file
diff --git a/ml-search.py b/ml-search.py
deleted file mode 100644
index 6e1b504..0000000
--- a/ml-search.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import torch
-from transformers import AutoTokenizer, AutoModelForQuestionAnswering
-from elasticsearch import Elasticsearch
-
-# Load the pre-trained Transformer model and tokenizer
-model_name = "distilbert-base-uncased-distilled-squad"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForQuestionAnswering.from_pretrained(model_name)
-
-# Initialize the Elasticsearch client
-es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
-
-# Define the index you want to query
-index_name = 'markdown_files'
-
-# Define the question
-question = "What is the cms module?"
-
-# Define the query to find relevant documents
-query = {
-    "query": {
-        "simple_query_string": {
-            "query": question,
-            "default_operator": "and",
-            "fields": ["title", "content"]
-        }
-    },
-    "size": 3  # Limit the number of documents to retrieve
-}
-
-# Execute the query and get the results
-response = es.search(index=index_name, body=query)
-
-# Extract relevant passages from the returned documents
-passages = [hit['_source']['content'] for hit in response['hits']['hits']]
-
-# Use the Transformer model to answer the question
-max_answer_length = 30
-
-for passage in passages:
-    inputs = tokenizer(question, passage, return_tensors='pt', max_length=512, truncation=True)
-    outputs = model(**inputs)
-    answer_start = torch.argmax(outputs.start_logits)
-    answer_end = torch.argmax(outputs.end_logits)
-    input_ids = inputs["input_ids"][0].tolist()
-    answer_tokens = input_ids[answer_start:answer_end + 1]
-    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
-
-    print(f"Passage: {passage[:200]}...")
-    print(f"Answer: {answer}\n")
diff --git a/requirements.txt b/requirements.txt
index 551cfd0..5fc7a33 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,6 @@
 requests==2.26.0
 elasticsearch==7.16.0
 transformers
-torch
\ No newline at end of file
+torch
+beautifulsoup4
+sentence-transformers
\ No newline at end of file
diff --git a/search.py b/search.py
deleted file mode 100644
index 659869b..0000000
--- a/search.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from elasticsearch import Elasticsearch
-
-# Initialize the Elasticsearch client
-es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
-
-# Define the index you want to query
-index_name = 'markdown_files'
-
-# Define the question
-question = "What is the cms module?"
-
-# Define the query
-query = {
-    "query": {
-        "simple_query_string": {
-            "query": question,
-            "default_operator": "and",
-            "fields": ["title", "content"]
-        }
-    }
-}
-
-# Execute the query and get the results
-response = es.search(index=index_name, body=query)
-
-# Print the number of hits (matching documents)
-print(f"Found {response['hits']['total']['value']} documents")
-
-# Print the documents
-for hit in response['hits']['hits']:
-    print(f"Document ID: {hit['_id']}")
-    print(f"Document Score: {hit['_score']}")
-    print(f"Document Title: {hit['_source']['title']}")
-    print(f"Document Content:\n{hit['_source']['content']}\n")