This repository has been archived by the owner on Feb 12, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ace9563
commit 5c25ae5
Showing
8 changed files
with
170 additions
and
201 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
.DS_Store | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Simple Question Answering ML System | ||
|
||
Documentation has never been so fun 😎 | ||
|
||
## Setup | ||
|
||
```bash | ||
python -m venv venv | ||
|
||
# Linux / MacOS | ||
source venv/bin/activate | ||
|
||
# Windows | ||
.\venv\Scripts\activate | ||
|
||
|
||
pip install -r requirements.txt | ||
|
||
``` | ||
|
||
```bash | ||
docker-compose up -d | ||
|
||
``` | ||
|
||
Once the containers are running, Kibana is available at http://localhost:5601 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Local Elasticsearch + Kibana stack used by the indexing and search scripts.
version: '3.8'

services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.16.3
    container_name: elasticsearch
    environment:
      # Run as a standalone node (no cluster discovery).
      - discovery.type=single-node
      # Lock the JVM heap in RAM; relies on the unlimited memlock ulimit below.
      - bootstrap.memory_lock=true
      # Fixed 512 MB heap.
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      # Persist index data across container restarts.
      - esdata:/usr/share/elasticsearch/data
    ports:
      # Quoted so YAML never reinterprets HOST:CONTAINER pairs as numbers.
      - "9200:9200"  # HTTP API (the Python scripts connect to localhost:9200)
      - "9300:9300"  # transport port

  kibana:
    image: docker.elastic.co/kibana/kibana:7.16.3
    container_name: kibana
    depends_on:
      - elasticsearch
    ports:
      - "5601:5601"

volumes:
  esdata:
    driver: local
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import os | ||
from elasticsearch import Elasticsearch | ||
|
||
# Index a Markdown file into Elasticsearch as a single searchable document.

# Markdown file to index and the Elasticsearch index that receives it.
markdown_file = 'README.md'
index_name = 'markdown_files'


def extract_title(markdown_content):
    """Return the text of the first Markdown heading in *markdown_content*.

    A heading is a line of one to six ``#`` characters followed by whitespace
    and some text. The ``#`` markers are stripped from the returned title.
    Lines inside fenced code blocks are ignored, so a shell comment such as
    ``# Linux / MacOS`` is never mistaken for a heading.

    Returns ``'Untitled'`` when the content has no heading.
    """
    in_fence = False
    for line in markdown_content.splitlines():
        stripped = line.strip()
        # A fence marker toggles "inside code block" state.
        if stripped.startswith(('```', '~~~')):
            in_fence = not in_fence
            continue
        if in_fence or not stripped.startswith('#'):
            continue
        text = stripped.lstrip('#')
        level = len(stripped) - len(text)
        # Require whitespace after the markers so "#hashtag" is not a heading,
        # and skip headings that have no text at all.
        if level <= 6 and text[:1].isspace() and text.strip():
            return text.strip()
    return 'Untitled'


def build_document(path):
    """Read the Markdown file at *path* and return the document to index.

    The document holds the absolute file path, the extracted title and the
    full Markdown content.
    """
    with open(path, 'r', encoding='utf-8') as f:
        markdown_content = f.read()
    return {
        'file': os.path.abspath(path),
        'title': extract_title(markdown_content),
        'content': markdown_content,
    }


def main():
    """Build the document for ``markdown_file`` and index it."""
    document = build_document(markdown_file)
    print(document)

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    # No doc_type: mapping types are deprecated in Elasticsearch 7 and the
    # client falls back to the default type when it is omitted.
    es.index(index=index_name, body=document)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import torch | ||
from transformers import AutoTokenizer, AutoModelForQuestionAnswering | ||
from elasticsearch import Elasticsearch | ||
|
||
# Answer a question over Markdown documents indexed in Elasticsearch:
# retrieve candidate documents, then run an extractive QA model on each one.

# Pre-trained extractive question-answering model.
model_name = "distilbert-base-uncased-distilled-squad"

# Elasticsearch index to query.
index_name = 'markdown_files'

# Question to answer.
question = "What is the cms module?"

# Maximum number of tokens allowed in an extracted answer.
max_answer_length = 30


def select_answer_span(start_logits, end_logits, max_answer_length=30):
    """Return the best ``(start, end)`` token indices for an answer span.

    Scores every pair by ``start_logits[start] + end_logits[end]`` and keeps
    only pairs where ``end >= start`` and the span covers at most
    ``max_answer_length`` tokens. Taking the argmax of the start and end
    logits independently can produce an end before the start (an empty
    answer) or an arbitrarily long span; this avoids both.

    Args:
        start_logits: Tensor of start scores for one sequence; any leading
            singleton dimensions are flattened.
        end_logits: Tensor of end scores with the same number of elements.
        max_answer_length: Maximum span length in tokens (values below 1 are
            treated as 1).

    Returns:
        A tuple of two Python ints ``(start, end)``, both inclusive.
    """
    start = start_logits.reshape(-1)
    end = end_logits.reshape(-1)
    n = end.numel()

    # scores[i, j] is the score of the span starting at i and ending at j.
    scores = start.unsqueeze(1) + end.unsqueeze(0)

    # offset[i, j] = j - i; valid spans have 0 <= offset < max_answer_length.
    positions = torch.arange(n, device=scores.device)
    offset = positions.unsqueeze(0) - positions.unsqueeze(1)
    valid = (offset >= 0) & (offset < max(1, max_answer_length))

    scores = scores.masked_fill(~valid, float('-inf'))
    best = int(torch.argmax(scores))
    return best // n, best % n


def main():
    """Retrieve matching documents and print the model's answer for each."""
    # Load the pre-trained Transformer model and tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

    # Find documents whose title or content contains every query term.
    query = {
        "query": {
            "simple_query_string": {
                "query": question,
                "default_operator": "and",
                "fields": ["title", "content"],
            }
        },
        "size": 3,  # Limit the number of documents to retrieve
    }
    response = es.search(index=index_name, body=query)
    passages = [hit['_source']['content'] for hit in response['hits']['hits']]

    for passage in passages:
        inputs = tokenizer(
            question, passage,
            return_tensors='pt', max_length=512, truncation=True,
        )
        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = model(**inputs)

        start, end = select_answer_span(
            outputs.start_logits, outputs.end_logits, max_answer_length
        )
        answer_tokens = inputs["input_ids"][0][start:end + 1].tolist()
        answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

        print(f"Passage: {passage[:200]}...")
        print(f"Answer: {answer}\n")


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
requests==2.26.0 | ||
elasticsearch==7.16.0 | ||
transformers | ||
torch |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
from elasticsearch import Elasticsearch | ||
|
||
# Search the indexed Markdown documents and print every match.

# Client for the local Elasticsearch node.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# Index to search and the question used as the query text.
index_name = 'markdown_files'
question = "What is the cms module?"

# Match documents whose title or content contains every query term.
simple_query = {
    "query": question,
    "default_operator": "and",
    "fields": ["title", "content"],
}
query = {"query": {"simple_query_string": simple_query}}

# Run the search.
response = es.search(index=index_name, body=query)
hits = response['hits']

# Report the total number of matching documents.
print(f"Found {hits['total']['value']} documents")

# Print id, relevance score, title and full content of each returned hit.
for hit in hits['hits']:
    source = hit['_source']
    print(f"Document ID: {hit['_id']}")
    print(f"Document Score: {hit['_score']}")
    print(f"Document Title: {source['title']}")
    print(f"Document Content:\n{source['content']}\n")