feat: qa with semantic search

achimoraites · Mar 22, 2023 · ae582f2 · ae582f2
1 parent 5c25ae5
commit ae582f2
Show file tree

Hide file tree

Showing 12 changed files with 249 additions and 112 deletions.
diff --git a/README.md b/README.md
@@ -2,6 +2,13 @@
 
 Documentation never has been so fun 😎
 
+This is a question-answering system that uses semantic search and a question-answering (QA) machine-learning model
+to answer the user's question.
+
+The system returns the answer together with its source.
+
+https://user-images.githubusercontent.com/4193340/226748890-2415ca7e-e13a-4e5e-9f82-c56a1a6408cc.mp4
+
 ## Setup
 
 ```bash
@@ -19,8 +26,22 @@ pip install -r requirements.txt
 ```
 
 ```bash
-docker-compose up -d
+docker-compose up -d --build
 
 ```
 
-Kibana link http://localhost:5601
+Kibana link http://localhost:5601
+The Flask application should be accessible at http://localhost:5001.
+
+## Elasticsearch
+The first time you run the project, you need to create the index that stores the documents:
+
+```bash
+python data-import/create_index.py
+```
+
+To import some sample pages you can run
+
+```bash
+python data-import/webpages.py
+```
diff --git a/data-import/create_index.py b/data-import/create_index.py
@@ -0,0 +1,19 @@
+from elasticsearch import Elasticsearch
+
+# One-off setup script: creates the "documentation_files" index with the
+# mapping below.
+
+# Initialize Elasticsearch client (expects a node on localhost:9200)
+es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
+
+# Create an index with the appropriate mapping.
+# "embedding" holds the vector that is stored next to each paragraph's text.
+mapping = {
+    "mappings": {
+        "properties": {
+            "title": {"type": "text"},
+            "url": {"type": "keyword"},
+            "paragraph_id": {"type": "integer"},
+            "content": {"type": "text"},
+            "embedding": {"type": "dense_vector", "dims": 384}  # Adjust the dims based on the model output dimension
+        }
+    }
+}
+
+# ignore=400 tells the client not to raise on an HTTP 400 response.
+# NOTE(review): presumably meant to make re-runs harmless when the index
+# already exists; it would also hide other 400 errors (e.g. a bad mapping).
+es.indices.create(index="documentation_files", body=mapping, ignore=400)
diff --git a/data-import/webpages.py b/data-import/webpages.py
@@ -0,0 +1,50 @@
+import requests
+from bs4 import BeautifulSoup
+from elasticsearch import Elasticsearch
+from sentence_transformers import SentenceTransformer
+
+# Initialize Elasticsearch client
+es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
+
+# Load the sentence transformer model
+model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L-12-v3')
+
+# Define a function to fetch and parse webpages
+def fetch_and_parse(url, timeout=30):
+    """Fetch a web page and return one indexable document per paragraph.
+
+    Args:
+        url: Address of the page to download.
+        timeout: Seconds to wait for the server before giving up, so a
+            stalled connection cannot hang the import forever.
+
+    Returns:
+        A list of dicts with the keys ``url``, ``title``, ``paragraph_id``,
+        ``content`` and ``embedding`` (the paragraph embedding as a plain
+        list). ``paragraph_id`` is the position of the ``<p>`` element on
+        the page. Paragraphs without any text are skipped.
+
+    Raises:
+        requests.exceptions.RequestException: On a network failure, a
+            timeout, or an HTTP error status.
+    """
+    response = requests.get(url, timeout=timeout)
+    # Fail loudly instead of indexing the body of an error page
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # Extract webpage title; `.string` is None for an empty or nested <title>
+    title = (soup.title.string if soup.title else None) or "No title found"
+
+    # Extract and store paragraphs as separate documents
+    paragraphs = []
+    for i, p in enumerate(soup.find_all('p')):
+        content = p.get_text().strip()
+        if not content:
+            # An empty <p> has no text to search or answer from
+            continue
+
+        # Compute the paragraph embedding
+        embedding = model.encode(content, convert_to_tensor=True, show_progress_bar=False).tolist()
+
+        paragraphs.append({
+            'url': url,
+            'title': title,
+            'paragraph_id': i,
+            'content': content,
+            'embedding': embedding
+        })
+
+    return paragraphs
+
+# List of URLs to index
+urls = [
+    "https://en.wikipedia.org/wiki/Web_scraping",
+    "https://en.wikipedia.org/wiki/Elasticsearch",
+    "https://en.wikipedia.org/wiki/Python_(programming_language)"
+]
+
+# Index the webpages in Elasticsearch
+for url in urls:
+    parsed_pages = fetch_and_parse(url)
+
+    for page in parsed_pages:
+        # Index the paragraph as a separate document in Elasticsearch.
+        # doc_type is left out on purpose: the 7.x client already defaults
+        # it to "_doc", and newer clients no longer accept the argument.
+        es.index(index='documentation_files', body=page)
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -26,6 +26,14 @@ services:
     ports:
       - 5601:5601
 
+  flask_app:
+    build: ./flask_app
+    container_name: flask_app
+    depends_on:
+      - elasticsearch
+    ports:
+      - "5001:5000"
+
 volumes:
   esdata:
-    driver: local
+    driver: local
diff --git a/document-imports/markdown.py b/document-imports/markdown.py
diff --git a/flask_app/Dockerfile b/flask_app/Dockerfile
@@ -0,0 +1,20 @@
+# Use the official Python image as the base image
+FROM python:3.9
+
+# Set the working directory
+WORKDIR /app
+
+# Copy the requirements.txt file into the container
+COPY requirements.txt .
+
+# Install the dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application files into the container
+COPY . .
+
+# Expose the port the app runs on
+EXPOSE 5000
+
+# Run the Flask application
+CMD ["flask", "run", "--host", "0.0.0.0"]
diff --git a/flask_app/app.py b/flask_app/app.py
@@ -0,0 +1,82 @@
+from elasticsearch import Elasticsearch, RequestsHttpConnection
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+
+from flask import Flask, render_template, request
+from transformers import pipeline
+from sentence_transformers import SentenceTransformer
+
+app = Flask(__name__)
+
+
+# Question-answering pipeline; index() passes it a question plus a retrieved
+# passage as context and reads the 'answer' field of the result.
+model_name = "deepset/roberta-base-squad2"
+
+nlp = pipeline('question-answering', model=model_name,
+               tokenizer=model_name, padding=True, truncation=True)
+
+# Load the sentence transformer model.
+# Same model name as in data-import/webpages.py: the question vector is
+# compared against the paragraph embeddings that script stored.
+model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L-12-v3')
+
+# Define a retry strategy for the Elasticsearch client
+retry_strategy = Retry(
+    total=3,
+    status_forcelist=[429, 500, 502, 503, 504],
+    allowed_methods=["HEAD", "GET", "PUT", "DELETE", "OPTIONS", "POST"],
+    backoff_factor=1
+)
+
+adapter = HTTPAdapter(max_retries=retry_strategy)
+
+# Initialize the Elasticsearch client with the retry strategy.
+# The host 'elasticsearch' is the docker-compose service name this app
+# depends on.
+es = Elasticsearch(
+    [{'host': 'elasticsearch', 'port': 9200}],
+    connection_class=RequestsHttpConnection,
+    max_retries=3
+)
+# NOTE(review): this only sets an attribute on the connection pool; nothing
+# in this file mounts the adapter on a requests session, so the Retry
+# settings above may not take effect - confirm against the client version.
+es.transport.connection_pool.adapter = adapter
+
+
+
+@app.route("/", methods=["GET", "POST"])
+def index():
+    """Serve the search form and, on POST, answer the submitted question.
+
+    A POST embeds the question, asks Elasticsearch for the stored paragraph
+    with the highest cosine similarity to it, and runs the QA pipeline on
+    that paragraph. Any other method renders the empty form.
+    """
+    if request.method != "POST":
+        return render_template("index.html", question=None, answers=None)
+
+    question = request.form["question"]
+
+    # Embed the question as a plain list so it can travel as a script parameter
+    question_embedding = model.encode(
+        question, convert_to_tensor=True, show_progress_bar=False).tolist()
+
+    # Score every document by cosine similarity to the question vector;
+    # the "+ 1.0" keeps the script score from going negative.
+    similarity_script = {
+        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
+        "params": {"query_vector": question_embedding},
+    }
+    search_body = {
+        "query": {
+            "script_score": {
+                "query": {"match_all": {}},
+                "script": similarity_script,
+            }
+        },
+        "size": 1,
+    }
+    result = es.search(index='documentation_files', body=search_body)
+    passages = [hit['_source'] for hit in result['hits']['hits']]
+
+    # Pair each extracted answer with the passage it came from (the source)
+    answers = [
+        [
+            nlp({
+                'question': question,
+                'context': passage['title'] + ' ' + passage['content'],
+            })['answer'],
+            passage,
+        ]
+        for passage in passages
+    ]
+
+    return render_template("index.html", question=question, answers=answers)
+
+
+# Only reached when the file is executed directly (python app.py); the
+# Dockerfile starts the app through `flask run` instead.
+# debug=True turns on Flask's debug mode - keep it to local development.
+if __name__ == "__main__":
+    app.run(debug=True)
diff --git a/flask_app/requirements.txt b/flask_app/requirements.txt
@@ -0,0 +1,5 @@
+Flask==2.1.1
+transformers==4.12.2
+elasticsearch==7.15.2
+torch
+sentence-transformers
diff --git a/flask_app/templates/index.html b/flask_app/templates/index.html
@@ -0,0 +1,38 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>Question-Answering Search</title>
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
+  </head>
+  <body>
+    <div class="container mt-5">
+      <h1 class="mb-4">Question-Answering Search</h1>
+      <form action="/" method="post">
+        <div class="mb-3">
+          <label for="question" class="form-label">Enter your question:</label>
+          <input type="text" class="form-control" id="question" name="question" required>
+        </div>
+        <button type="submit" class="btn btn-primary">Search</button>
+      </form>
+      {% if question and answers %}
+        <h2 class="mt-5">Question: {{ question }}</h2>
+        <p> The answer is <strong>{{ answers[0][0] }} </strong></p>
+        <br />
+        <p>Sources:</p>
+        <ol>
+          {% for answer in answers %}
+            <li>
+              <div>
+                <h3> <a href="{{answer[1]['url']}}"> {{ answer[1]['title']}} </a>  </h3>
+                <p>{{answer[1]['content']}}</p>
+                <hr>
+              </div>
+            </li>
+          {% endfor %}
+        </ol>
+      {% endif %}
+    </div>
+  </body>
+</html>
diff --git a/ml-search.py b/ml-search.py
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,6 @@
 requests==2.26.0
 elasticsearch==7.16.0
 transformers
-torch
+torch
+beautifulsoup4
+sentence-transformers