Skip to content
This repository has been archived by the owner on Feb 12, 2025. It is now read-only.

Commit

Permalink
feat: basic infra setup
Browse files Browse the repository at this point in the history
  • Loading branch information
achimoraites committed Mar 17, 2023
1 parent ace9563 commit 5c25ae5
Show file tree
Hide file tree
Showing 8 changed files with 170 additions and 201 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
201 changes: 0 additions & 201 deletions LICENSE

This file was deleted.

26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Simple Question Answering ML System

Documentation has never been so fun 😎

## Setup

```bash
python -m venv venv

# Linux / MacOS
source venv/bin/activate

# Windows
.\venv\Scripts\activate


pip install -r requirements.txt

```

```bash
docker-compose up -d

```

Kibana is available at http://localhost:5601
31 changes: 31 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
version: '3.8'

services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.16.3
    container_name: elasticsearch
    environment:
      # Single-node dev cluster: no discovery or bootstrap quorum checks.
      - discovery.type=single-node
      # Lock the JVM heap in RAM so it is never swapped out.
      - bootstrap.memory_lock=true
      # Cap the heap at 512 MB for a lightweight local setup.
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      # Unlimited memlock is required for bootstrap.memory_lock=true.
      memlock:
        soft: -1
        hard: -1
    volumes:
      # Persist index data across container restarts.
      - esdata:/usr/share/elasticsearch/data
    ports:
      - 9200:9200   # HTTP API
      - 9300:9300   # transport (node-to-node)

  kibana:
    # Kibana version must match the Elasticsearch version.
    image: docker.elastic.co/kibana/kibana:7.16.3
    container_name: kibana
    depends_on:
      - elasticsearch
    ports:
      - 5601:5601   # Kibana UI

volumes:
  esdata:
    driver: local
24 changes: 24 additions & 0 deletions document-imports/markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import os
from elasticsearch import Elasticsearch

# Import a Markdown file into the 'markdown_files' Elasticsearch index.


def extract_title(markdown_content: str) -> str:
    """Return the first Markdown heading line, or 'Untitled' if none exists.

    Any line beginning with '#' is a heading; the first one found is
    returned verbatim (leading '#' characters included), matching the
    original behaviour. A single '#' prefix test already covers '##',
    '###', etc., so the tuple of prefixes is unnecessary.
    """
    for line in markdown_content.splitlines():
        if line.startswith('#'):
            return line
    return 'Untitled'


def build_document(markdown_file: str) -> dict:
    """Read *markdown_file* and build the Elasticsearch document payload.

    The payload holds the absolute file path, the extracted title, and
    the raw Markdown content.
    """
    with open(markdown_file, 'r', encoding='utf-8') as f:
        markdown_content = f.read()
    return {
        'file': os.path.abspath(markdown_file),
        'title': extract_title(markdown_content),
        'content': markdown_content,
    }


def main() -> None:
    """Build the document for README.md and index it into Elasticsearch."""
    document = build_document('README.md')
    print(document)
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    # doc_type is deprecated in Elasticsearch 7.x; the client defaults to
    # '_doc', so it is omitted here.
    es.index(index='markdown_files', body=document)


if __name__ == '__main__':
    main()
50 changes: 50 additions & 0 deletions ml-search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from elasticsearch import Elasticsearch

# Answer a question with a SQuAD-distilled Transformer over documents
# retrieved from the 'markdown_files' Elasticsearch index.

INDEX_NAME = 'markdown_files'
QUESTION = "What is the cms module?"


def build_query(question: str, size: int = 3) -> dict:
    """Return the Elasticsearch search body for *question*.

    Uses simple_query_string over the title and content fields with
    'and' as the default operator (every term must match); *size* caps
    how many candidate documents are retrieved for answer extraction.
    """
    return {
        "query": {
            "simple_query_string": {
                "query": question,
                "default_operator": "and",
                "fields": ["title", "content"],
            }
        },
        "size": size,
    }


def answer_question(question, passage, tokenizer, model):
    """Extract an answer span for *question* from *passage*.

    Encodes the (question, passage) pair (truncated to 512 tokens),
    picks the argmax start/end logits, and decodes the token span.
    NOTE(review): if the predicted end precedes the start the decoded
    answer is empty — same as the original code; the original also
    defined an unused max_answer_length = 30, dropped here.
    """
    inputs = tokenizer(question, passage, return_tensors='pt',
                       max_length=512, truncation=True)
    outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits)
    input_ids = inputs["input_ids"][0].tolist()
    answer_tokens = input_ids[answer_start:answer_end + 1]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True)


def main() -> None:
    """Retrieve relevant passages and print an extracted answer for each."""
    model_name = "distilbert-base-uncased-distilled-squad"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    response = es.search(index=INDEX_NAME, body=build_query(QUESTION))
    passages = [hit['_source']['content'] for hit in response['hits']['hits']]

    for passage in passages:
        answer = answer_question(QUESTION, passage, tokenizer, model)
        print(f"Passage: {passage[:200]}...")
        print(f"Answer: {answer}\n")


if __name__ == '__main__':
    main()
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
requests==2.26.0
elasticsearch==7.16.0
transformers
torch
34 changes: 34 additions & 0 deletions search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from elasticsearch import Elasticsearch

# Keyword search over the 'markdown_files' Elasticsearch index.

INDEX_NAME = 'markdown_files'
QUESTION = "What is the cms module?"


def build_query(question: str) -> dict:
    """Return the Elasticsearch search body for *question*.

    simple_query_string over title and content; 'and' as the default
    operator means every query term must match.
    """
    return {
        "query": {
            "simple_query_string": {
                "query": question,
                "default_operator": "and",
                "fields": ["title", "content"],
            }
        }
    }


def print_hits(response: dict) -> None:
    """Print the total hit count, then id/score/title/content per hit."""
    print(f"Found {response['hits']['total']['value']} documents")
    for hit in response['hits']['hits']:
        print(f"Document ID: {hit['_id']}")
        print(f"Document Score: {hit['_score']}")
        print(f"Document Title: {hit['_source']['title']}")
        print(f"Document Content:\n{hit['_source']['content']}\n")


def main() -> None:
    """Run the hard-coded question against the index and print the hits."""
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    print_hits(es.search(index=INDEX_NAME, body=build_query(QUESTION)))


if __name__ == '__main__':
    main()

0 comments on commit 5c25ae5

Please sign in to comment.