Skip to content
This repository has been archived by the owner on Feb 12, 2025. It is now read-only.

Commit

Permalink
feat: basic infra setup
Browse files Browse the repository at this point in the history
  • Loading branch information
achimoraites committed Mar 17, 2023
1 parent ace9563 commit 5c25ae5
Show file tree
Hide file tree
Showing 8 changed files with 170 additions and 201 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
201 changes: 0 additions & 201 deletions LICENSE

This file was deleted.

26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Simple Question Answering ML System

Documentation has never been so fun 😎

## Setup

```bash
python -m venv venv

# Linux / MacOS
source venv/bin/activate

# Windows
.\venv\Scripts\activate


pip install -r requirements.txt

```

```bash
docker-compose up -d

```

Kibana is available at http://localhost:5601
31 changes: 31 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
version: '3.8'

services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.16.3
    container_name: elasticsearch
    environment:
      # Single-node dev cluster: no discovery or bootstrap quorum checks.
      - discovery.type=single-node
      # Lock the JVM heap in RAM so it is never swapped out.
      - bootstrap.memory_lock=true
      # Cap the heap at 512 MB for a lightweight local setup.
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      # Unlimited memlock is required for bootstrap.memory_lock=true.
      memlock:
        soft: -1
        hard: -1
    volumes:
      # Persist index data across container restarts.
      - esdata:/usr/share/elasticsearch/data
    ports:
      - 9200:9200   # HTTP API
      - 9300:9300   # transport (node-to-node)

  kibana:
    # Kibana version must match the Elasticsearch version.
    image: docker.elastic.co/kibana/kibana:7.16.3
    container_name: kibana
    depends_on:
      - elasticsearch
    ports:
      - 5601:5601   # Kibana UI

volumes:
  esdata:
    driver: local
24 changes: 24 additions & 0 deletions document-imports/markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import os
from elasticsearch import Elasticsearch

# Import a Markdown file into the 'markdown_files' Elasticsearch index.


def extract_title(markdown_content: str) -> str:
    """Return the first Markdown heading line, or 'Untitled' if none exists.

    Any line beginning with '#' is a heading; the first one found is
    returned verbatim (leading '#' characters included), matching the
    original behaviour. A single '#' prefix test already covers '##',
    '###', etc., so the tuple of prefixes is unnecessary.
    """
    for line in markdown_content.splitlines():
        if line.startswith('#'):
            return line
    return 'Untitled'


def build_document(markdown_file: str) -> dict:
    """Read *markdown_file* and build the Elasticsearch document payload.

    The payload holds the absolute file path, the extracted title, and
    the raw Markdown content.
    """
    with open(markdown_file, 'r', encoding='utf-8') as f:
        markdown_content = f.read()
    return {
        'file': os.path.abspath(markdown_file),
        'title': extract_title(markdown_content),
        'content': markdown_content,
    }


def main() -> None:
    """Build the document for README.md and index it into Elasticsearch."""
    document = build_document('README.md')
    print(document)
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    # doc_type is deprecated in Elasticsearch 7.x; the client defaults to
    # '_doc', so it is omitted here.
    es.index(index='markdown_files', body=document)


if __name__ == '__main__':
    main()
50 changes: 50 additions & 0 deletions ml-search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from elasticsearch import Elasticsearch

# Answer a question with a SQuAD-distilled Transformer over documents
# retrieved from the 'markdown_files' Elasticsearch index.

INDEX_NAME = 'markdown_files'
QUESTION = "What is the cms module?"


def build_query(question: str, size: int = 3) -> dict:
    """Return the Elasticsearch search body for *question*.

    Uses simple_query_string over the title and content fields with
    'and' as the default operator (every term must match); *size* caps
    how many candidate documents are retrieved for answer extraction.
    """
    return {
        "query": {
            "simple_query_string": {
                "query": question,
                "default_operator": "and",
                "fields": ["title", "content"],
            }
        },
        "size": size,
    }


def answer_question(question, passage, tokenizer, model):
    """Extract an answer span for *question* from *passage*.

    Encodes the (question, passage) pair (truncated to 512 tokens),
    picks the argmax start/end logits, and decodes the token span.
    NOTE(review): if the predicted end precedes the start the decoded
    answer is empty — same as the original code; the original also
    defined an unused max_answer_length = 30, dropped here.
    """
    inputs = tokenizer(question, passage, return_tensors='pt',
                       max_length=512, truncation=True)
    outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits)
    input_ids = inputs["input_ids"][0].tolist()
    answer_tokens = input_ids[answer_start:answer_end + 1]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True)


def main() -> None:
    """Retrieve relevant passages and print an extracted answer for each."""
    model_name = "distilbert-base-uncased-distilled-squad"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    response = es.search(index=INDEX_NAME, body=build_query(QUESTION))
    passages = [hit['_source']['content'] for hit in response['hits']['hits']]

    for passage in passages:
        answer = answer_question(QUESTION, passage, tokenizer, model)
        print(f"Passage: {passage[:200]}...")
        print(f"Answer: {answer}\n")


if __name__ == '__main__':
    main()
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
requests==2.26.0
elasticsearch==7.16.0
transformers
torch
34 changes: 34 additions & 0 deletions search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from elasticsearch import Elasticsearch

# Keyword search over the 'markdown_files' Elasticsearch index.

INDEX_NAME = 'markdown_files'
QUESTION = "What is the cms module?"


def build_query(question: str) -> dict:
    """Return the Elasticsearch search body for *question*.

    simple_query_string over title and content; 'and' as the default
    operator means every query term must match.
    """
    return {
        "query": {
            "simple_query_string": {
                "query": question,
                "default_operator": "and",
                "fields": ["title", "content"],
            }
        }
    }


def print_hits(response: dict) -> None:
    """Print the total hit count, then id/score/title/content per hit."""
    print(f"Found {response['hits']['total']['value']} documents")
    for hit in response['hits']['hits']:
        print(f"Document ID: {hit['_id']}")
        print(f"Document Score: {hit['_score']}")
        print(f"Document Title: {hit['_source']['title']}")
        print(f"Document Content:\n{hit['_source']['content']}\n")


def main() -> None:
    """Run the hard-coded question against the index and print the hits."""
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    print_hits(es.search(index=INDEX_NAME, body=build_query(QUESTION)))


if __name__ == '__main__':
    main()

0 comments on commit 5c25ae5

Please sign in to comment.