From ae582f2d6112302ad41273750b41cb8eee7fb879 Mon Sep 17 00:00:00 2001
From: Achilleas Moraitis
Date: Fri, 17 Mar 2023 14:58:21 +0100
Subject: [PATCH] feat: qa with semantic search

---
 README.md                      | 25 ++++++++++-
 data-import/create_index.py    | 19 ++++++++
 data-import/webpages.py        | 50 +++++++++++++++++++++
 docker-compose.yml             | 10 ++++-
 document-imports/markdown.py   | 24 ----------
 flask_app/Dockerfile           | 20 +++++++++
 flask_app/app.py               | 82 ++++++++++++++++++++++++++++++++++
 flask_app/requirements.txt     |  5 +++
 flask_app/templates/index.html | 38 ++++++++++++++++
 ml-search.py                   | 50 ---------------------
 requirements.txt               |  4 +-
 search.py                      | 34 --------------
 12 files changed, 249 insertions(+), 112 deletions(-)
 create mode 100644 data-import/create_index.py
 create mode 100644 data-import/webpages.py
 delete mode 100644 document-imports/markdown.py
 create mode 100644 flask_app/Dockerfile
 create mode 100644 flask_app/app.py
 create mode 100644 flask_app/requirements.txt
 create mode 100644 flask_app/templates/index.html
 delete mode 100644 ml-search.py
 delete mode 100644 search.py

diff --git a/README.md b/README.md
index 5acd313..7663ca9 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,13 @@
 
 Documentation never has been so fun 😎
 
+This is a question-answering system that uses semantic search and a QA ML model
+to answer the user's question.
+
+The system returns both the answer and its source.
+
+https://user-images.githubusercontent.com/4193340/226748890-2415ca7e-e13a-4e5e-9f82-c56a1a6408cc.mp4
+
 ## Setup
 
 ```bash
@@ -19,8 +26,22 @@ pip install -r requirements.txt
 ```
 
 ```bash
-docker-compose up -d
+docker-compose up -d --build
 ```
 
-Kibana link http://localhost:5601
\ No newline at end of file
+Kibana link http://localhost:5601
+The Flask application should be accessible at http://localhost:5001.
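+
+Once the containers are up, you can sanity-check the question endpoint from the
+command line (a minimal sketch; the `question` form field matches what
+`flask_app/app.py` reads from the request):
+
+```bash
+curl -s -X POST -d "question=What is Elasticsearch?" http://localhost:5001/
+```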
+
+## Elasticsearch
+
+The first time, you need to create the index that stores the documents:
+
+```bash
+python data-import/create_index.py
+```
+
+To import some sample pages, run:
+
+```bash
+python data-import/webpages.py
+```
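+
+To verify that the paragraphs were indexed, you can query Elasticsearch directly
+(a quick check; `documentation_files` is the index name used by
+`data-import/create_index.py`):
+
+```bash
+curl -s "http://localhost:9200/documentation_files/_search?size=1&pretty"
+```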
diff --git a/data-import/create_index.py b/data-import/create_index.py
new file mode 100644
index 0000000..bdd4c5c
--- /dev/null
+++ b/data-import/create_index.py
@@ -0,0 +1,19 @@
+from elasticsearch import Elasticsearch
+
+# Initialize Elasticsearch client
+es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
+
+# Create an index with the appropriate mapping
+mapping = {
+    "mappings": {
+        "properties": {
+            "title": {"type": "text"},
+            "url": {"type": "keyword"},
+            "paragraph_id": {"type": "integer"},
+            "content": {"type": "text"},
+            "embedding": {"type": "dense_vector", "dims": 384}  # Adjust the dims based on the model output dimension
+        }
+    }
+}
+
+es.indices.create(index="documentation_files", body=mapping, ignore=400)
\ No newline at end of file
diff --git a/data-import/webpages.py b/data-import/webpages.py
new file mode 100644
index 0000000..dbd625d
--- /dev/null
+++ b/data-import/webpages.py
@@ -0,0 +1,50 @@
+import requests
+from bs4 import BeautifulSoup
+from elasticsearch import Elasticsearch
+from sentence_transformers import SentenceTransformer
+
+# Initialize Elasticsearch client
+es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
+
+# Load the sentence transformer model
+model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L-12-v3')
+
+# Define a function to fetch and parse webpages
+def fetch_and_parse(url):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # Extract webpage title
+    title = soup.title.string if soup.title else "No title found"
+
+    # Extract and store paragraphs as separate documents
+    paragraphs = []
+    for i, p in enumerate(soup.find_all('p')):
+        content = p.get_text().strip()
+        # Compute the paragraph embedding
+        embedding = model.encode(content, convert_to_tensor=True, show_progress_bar=False).tolist()
+
+        paragraphs.append({
+            'url': url,
+            'title': title,
+            'paragraph_id': i,
+            'content': content,
+            'embedding': embedding
+        })
+
+    return paragraphs
+
+# List of URLs to index
+urls = [
+    "https://en.wikipedia.org/wiki/Web_scraping",
+    "https://en.wikipedia.org/wiki/Elasticsearch",
+    "https://en.wikipedia.org/wiki/Python_(programming_language)"
+]
+
+# Index the webpages in Elasticsearch
+for url in urls:
+    parsed_pages = fetch_and_parse(url)
+
+    for page in parsed_pages:
+        # Index the paragraph as a separate document in Elasticsearch
+        es.index(index='documentation_files', doc_type='_doc', body=page)
diff --git a/docker-compose.yml b/docker-compose.yml
index 01f5b69..506fab1 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -26,6 +26,14 @@ services:
     ports:
       - 5601:5601
 
+  flask_app:
+    build: ./flask_app
+    container_name: flask_app
+    depends_on:
+      - elasticsearch
+    ports:
+      - "5001:5000"
+
 volumes:
   esdata:
-    driver: local
\ No newline at end of file
+    driver: local
diff --git a/document-imports/markdown.py b/document-imports/markdown.py
deleted file mode 100644
index aed7afd..0000000
--- a/document-imports/markdown.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import os
-from elasticsearch import Elasticsearch
-
-# Load the Markdown file
-markdown_file = 'README.md'
-with open(markdown_file, 'r', encoding='utf-8') as f:
-    markdown_content = f.read()
-
-# Extract the title (assuming the first heading is the title)
-title_lines = [line for line in markdown_content.splitlines() if line.startswith(('#', '##', '###', '####', '#####', '######'))]
-title = title_lines[0] if title_lines else 'Untitled'
-
-# Preprocess the data: create a document for Elasticsearch
-document = {
-    'file': os.path.abspath(markdown_file),
-    'title': title,
-    'content': markdown_content,
-}
-print(document)
-# Index the document in Elasticsearch
-es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
-index_name = 'markdown_files'
-doc_type = '_doc'
-es.index(index=index_name, doc_type=doc_type, body=document)
diff --git a/flask_app/Dockerfile b/flask_app/Dockerfile
new file mode 100644
index 0000000..1bf3278
--- /dev/null
+++ b/flask_app/Dockerfile
@@ -0,0 +1,20 @@
+# Use the official Python image as the base image
+FROM python:3.9
+
+# Set the working directory
+WORKDIR /app
+
+# Copy the requirements.txt file into the container
+COPY requirements.txt .
+
+# Install the dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application files into the container
+COPY . .
+
+# Expose the port the app runs on
+EXPOSE 5000
+
+# Run the Flask application
+CMD ["flask", "run", "--host", "0.0.0.0"]
diff --git a/flask_app/app.py b/flask_app/app.py
new file mode 100644
index 0000000..32da89a
--- /dev/null
+++ b/flask_app/app.py
@@ -0,0 +1,82 @@
+from elasticsearch import Elasticsearch, RequestsHttpConnection
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+
+from flask import Flask, render_template, request
+from transformers import pipeline
+from sentence_transformers import SentenceTransformer
+
+app = Flask(__name__)
+
+
+model_name = "deepset/roberta-base-squad2"
+
+nlp = pipeline('question-answering', model=model_name,
+               tokenizer=model_name, padding=True, truncation=True)
+
+# Load the sentence transformer model
+model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L-12-v3')
+
+# Define a retry strategy for the Elasticsearch client
+retry_strategy = Retry(
+    total=3,
+    status_forcelist=[429, 500, 502, 503, 504],
+    allowed_methods=["HEAD", "GET", "PUT", "DELETE", "OPTIONS", "POST"],
+    backoff_factor=1
+)
+
+adapter = HTTPAdapter(max_retries=retry_strategy)
+
+# Initialize the Elasticsearch client with the retry strategy
+es = Elasticsearch(
+    [{'host': 'elasticsearch', 'port': 9200}],
+    connection_class=RequestsHttpConnection,
+    max_retries=3
+)
+es.transport.connection_pool.adapter = adapter
+
+
+@app.route("/", methods=["GET", "POST"])
+def index():
+    if request.method == "POST":
+        question = request.form["question"]
+
+        # Compute the question embedding
+        question_embedding = model.encode(
+            question, convert_to_tensor=True, show_progress_bar=False).tolist()
+
+        index_name = 'documentation_files'
+
+        query = {
+            "query": {
+                "script_score": {
+                    "query": {"match_all": {}},
+                    "script": {
+                        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
+                        "params": {"query_vector": question_embedding}
+                    }
+                }
+            },
+            "size": 1
+        }
+        response = es.search(index=index_name, body=query)
+        passages = [hit['_source'] for hit in response['hits']['hits']]
+
+        answers = []
+        for passage in passages:
+            QA_input = {
+                'question': question,
+                'context': passage['title'] + ' ' + passage['content']
+            }
+            answer = nlp(QA_input)['answer']
+
+            answers.append([answer, passage])
+
+        return render_template("index.html", question=question, answers=answers)
+    else:
+        return render_template("index.html", question=None, answers=None)
+
+
+if __name__ == "__main__":
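+    # Debug mode is only for running app.py directly; the Docker image starts
+    # the app with "flask run" instead (see flask_app/Dockerfile).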
+    app.run(debug=True)
diff --git a/flask_app/requirements.txt b/flask_app/requirements.txt
new file mode 100644
index 0000000..5b6bbb8
--- /dev/null
+++ b/flask_app/requirements.txt
@@ -0,0 +1,5 @@
+Flask==2.1.1
+transformers==4.12.2
+elasticsearch==7.15.2
+torch
+sentence-transformers
\ No newline at end of file
diff --git a/flask_app/templates/index.html b/flask_app/templates/index.html
new file mode 100644
index 0000000..d8d768d
--- /dev/null
+++ b/flask_app/templates/index.html
@@ -0,0 +1,38 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Question-Answering Search</title>
+</head>
+<body>
+    <h1>Question-Answering Search</h1>
+
+    <form method="POST">
+        <input type="text" name="question" placeholder="Ask a question..." required>
+        <button type="submit">Search</button>
+    </form>
+
+    {% if question and answers %}
+    <h2>Question: {{ question }}</h2>
+
+    <p>The answer is <strong>{{ answers[0][0] }}</strong></p>
+
+    <h3>Sources:</h3>
+    <ol>
+        {% for answer in answers %}
+        <li>
+            <h4><a href="{{ answer[1]['url'] }}" target="_blank">{{ answer[1]['title'] }}</a></h4>
+            <p>{{ answer[1]['content'] }}</p>
+        </li>
+        {% endfor %}
+    </ol>
+    {% endif %}
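+    <!-- "answers" is a list of [answer, passage] pairs built in flask_app/app.py;
+         answers[0][0] is the top extracted answer shown above. -->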
+</body>
+</html>
\ No newline at end of file
diff --git a/ml-search.py b/ml-search.py
deleted file mode 100644
index 6e1b504..0000000
--- a/ml-search.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import torch
-from transformers import AutoTokenizer, AutoModelForQuestionAnswering
-from elasticsearch import Elasticsearch
-
-# Load the pre-trained Transformer model and tokenizer
-model_name = "distilbert-base-uncased-distilled-squad"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForQuestionAnswering.from_pretrained(model_name)
-
-# Initialize the Elasticsearch client
-es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
-
-# Define the index you want to query
-index_name = 'markdown_files'
-
-# Define the question
-question = "What is the cms module?"
-
-# Define the query to find relevant documents
-query = {
-    "query": {
-        "simple_query_string": {
-            "query": question,
-            "default_operator": "and",
-            "fields": ["title", "content"]
-        }
-    },
-    "size": 3  # Limit the number of documents to retrieve
-}
-
-# Execute the query and get the results
-response = es.search(index=index_name, body=query)
-
-# Extract relevant passages from the returned documents
-passages = [hit['_source']['content'] for hit in response['hits']['hits']]
-
-# Use the Transformer model to answer the question
-max_answer_length = 30
-
-for passage in passages:
-    inputs = tokenizer(question, passage, return_tensors='pt', max_length=512, truncation=True)
-    outputs = model(**inputs)
-    answer_start = torch.argmax(outputs.start_logits)
-    answer_end = torch.argmax(outputs.end_logits)
-    input_ids = inputs["input_ids"][0].tolist()
-    answer_tokens = input_ids[answer_start:answer_end + 1]
-    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
-
-    print(f"Passage: {passage[:200]}...")
-    print(f"Answer: {answer}\n")
diff --git a/requirements.txt b/requirements.txt
index 551cfd0..5fc7a33 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,6 @@
 requests==2.26.0
 elasticsearch==7.16.0
 transformers
-torch
\ No newline at end of file
+torch
+beautifulsoup4
+sentence-transformers
\ No newline at end of file
diff --git a/search.py b/search.py
deleted file mode 100644
index 659869b..0000000
--- a/search.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from elasticsearch import Elasticsearch
-
-# Initialize the Elasticsearch client
-es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
-
-# Define the index you want to query
-index_name = 'markdown_files'
-
-# Define the question
-question = "What is the cms module?"
-
-# Define the query
-query = {
-    "query": {
-        "simple_query_string": {
-            "query": question,
-            "default_operator": "and",
-            "fields": ["title", "content"]
-        }
-    }
-}
-
-# Execute the query and get the results
-response = es.search(index=index_name, body=query)
-
-# Print the number of hits (matching documents)
-print(f"Found {response['hits']['total']['value']} documents")
-
-# Print the documents
-for hit in response['hits']['hits']:
-    print(f"Document ID: {hit['_id']}")
-    print(f"Document Score: {hit['_score']}")
-    print(f"Document Title: {hit['_source']['title']}")
-    print(f"Document Content:\n{hit['_source']['content']}\n")