Commit f2205e8
working example with elections
teotoplak committed Mar 3, 2023
1 parent 7814ec3 commit f2205e8
Showing 5 changed files with 63 additions and 3 deletions.
1 change: 1 addition & 0 deletions environment.yml
@@ -12,3 +12,4 @@ dependencies:
- tiktoken==0.2.0
- beautifulsoup4==4.11.2
- google-cloud-translate==3.11.0
- transformers==4.26.1
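
A quick check that the newly pinned transformers release provides the tokenizer the code below relies on (a sketch, not part of the commit; the first call downloads the gpt2 vocabulary):

    from transformers import GPT2TokenizerFast

    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    print(len(tokenizer.encode("hello world")))  # 2 tokens under the gpt2 vocabulary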
Binary file added src/openapi/__pycache__/openapi.cpython-311.pyc
15 changes: 12 additions & 3 deletions src/openapi/openapi.py
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
import numpy as np
import tiktoken
from transformers import GPT2TokenizerFast

openai.api_key_path = ".openai_api_key"

@@ -21,6 +22,12 @@
"max_tokens": 300,
"model": COMPLETIONS_MODEL,
}
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def count_tokens(text: str) -> int:
"""count the number of tokens in a string"""
return len(tokenizer.encode(text))


def call_prompt():
@@ -149,10 +156,12 @@ def answer_query_with_context(

if __name__ == '__main__':
# print(call_prompt())
document_embeddings = load_embeddings("./data/olympics_sections_document_embeddings.csv")
# document_embeddings = load_embeddings("./data/olympics_sections_document_embeddings.csv")
document_embeddings = load_embeddings("./data/embeddings/test-em.csv")
# print(order_document_sections_by_query_similarity("Who won the men's high jump?", document_embeddings)[:5])
df = pd.read_csv('./data/olympics_sections_text.csv')
# df = pd.read_csv('./data/olympics_sections_text.csv')
df = pd.read_csv('./data/embeddings/test.csv')
df = df.set_index(["title", "heading"])
res = answer_query_with_context("Who won the 2020 Summer Olympics men's high jump?", df, document_embeddings)
res = answer_query_with_context("What is Estonia's 200 family policy?", df, document_embeddings)
print(res)
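
The helpers this entry point calls (load_embeddings, order_document_sections_by_query_similarity, answer_query_with_context) are defined earlier in the file and unchanged by this commit, so they do not appear in the diff. The module follows the OpenAI cookbook question-answering recipe (the olympics_sections files are that recipe's sample data), where the similarity primitive is a plain dot product. A sketch under that assumption:

    import numpy as np

    def vector_similarity(x: list[float], y: list[float]) -> float:
        # OpenAI embeddings are normalised to length 1, so a dot product
        # behaves like cosine similarity
        return np.dot(np.array(x), np.array(y))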

45 changes: 45 additions & 0 deletions src/vectorizer/vectorizer.py
@@ -0,0 +1,45 @@
import hashlib

import pandas as pd

from src.openapi.openapi import count_tokens, get_embedding, vector_similarity

def split_file_into_chunks(file):
    """Split a plain-text file into (title, heading, content, tokens) rows, one per non-empty line."""
    with open(file, 'r', encoding='utf-8') as input_file:
        text_list = input_file.read().split('\n')
    res = []
    for text in text_list:
        if text == '':
            continue
        num_tokens = count_tokens(text)
        # skip fragments too short to be useful and chunks too long for the prompt context
        if num_tokens < 10 or num_tokens > 400:
            continue
        # use a stable hash as the heading so ids survive across runs
        # (Python's built-in hash() is salted per process)
        heading = hashlib.md5(text.encode('utf-8')).hexdigest()
        res.append(("Elections", heading, text, num_tokens))
    return res


def vectorize_file_to_csv(input_path, output_path):
    """Write the chunked text to a CSV with columns title, heading, content, tokens."""
    res = split_file_into_chunks(input_path)
    df = pd.DataFrame(res, columns=["title", "heading", "content", "tokens"])
    df.to_csv(output_path, index=False)
    return df


def embeddings_to_df(df: pd.DataFrame, file):
    """Embed each chunk's content, one embedding dimension per CSV column."""
    embeddings = df["content"].apply(get_embedding).tolist()
    embedding_df = pd.DataFrame(embeddings, columns=[f"{i}" for i in range(len(embeddings[0]))])
    df = pd.concat([df, embedding_df], axis=1)
    # drop the raw text and token counts; the embeddings file only needs keys and vectors
    del df['content']
    del df['tokens']
    df.to_csv(file, index=False)
    return df


if __name__ == '__main__':
    df = vectorize_file_to_csv("./data/elections/english/test.txt", "./data/embeddings/test.csv")
    embeddings_df = embeddings_to_df(df, "./data/embeddings/test-em.csv")
    print(embeddings_df.head(2))
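
The embeddings CSV written here is the file the updated __main__ in openapi.py loads via load_embeddings. In the cookbook recipe that module follows, the loader reads the per-dimension columns back into a dict keyed by (title, heading); a sketch, assuming the same shape applies here:

    import pandas as pd

    def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
        # columns: title, heading, then "0", "1", ... one per embedding dimension,
        # matching the column names produced by embeddings_to_df above
        df = pd.read_csv(fname, header=0)
        max_dim = max(int(c) for c in df.columns if c not in ("title", "heading"))
        return {
            (r.title, r.heading): [r[str(i)] for i in range(max_dim + 1)]
            for _, r in df.iterrows()
        }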
5 changes: 5 additions & 0 deletions tst/vectorizer/test_vectorizer.py
@@ -0,0 +1,5 @@
from src.vectorizer.vectorizer import split_file_into_chunks


def test_vectorizer(tmp_path):
    # tmp_path is pytest's built-in temporary-directory fixture
    sample = tmp_path / "sample.txt"
    sample.write_text("This is a sentence that should comfortably pass the ten token minimum filter.\n")
    chunks = split_file_into_chunks(sample)
    assert len(chunks) == 1
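
With the repository root on the import path, the test runs under pytest (assuming pytest is installed in the environment):

    pytest tst/vectorizer/test_vectorizer.py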
