Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
alebjanes committed Apr 1, 2024
2 parents 69c4aa8 + e1410c6 commit 8aa2373
Show file tree
Hide file tree
Showing 31 changed files with 12,945 additions and 633 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
log.txt
23 changes: 22 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ This repository contains scripts for a chatbot that leverages artificial intelli
5. **`table_selection/`**
- All scripts needed to look up and manage the relevant cube that contains the data needed to answer the user's query.


## General Workflow

### 1. Table Selection
Expand Down Expand Up @@ -74,6 +73,7 @@ This repository contains scripts for a chatbot that leverages artificial intelli

6. The data is retrieved from the API using the `fetch_data()` method and stored in a pandas dataframe.


### 3. Data Analysis/Processing

- [In progress...] Data Analysis is done with LangChain, using the pandas dataframe agent.
Expand Down Expand Up @@ -107,6 +107,27 @@ In order to add one cube, the steps are:
- Add each measure separately, filling the following fields for each:
```json
{
"name": "Millions Of Dollars",
"description": "value in millions of dollars of a certain shipment."
}
```
- dimensions
- Add each hierarchy separately, filling the following fields for each:
```json
{
"name": "Time",
"description": "Periodicity of the data (monthly or annual).",
"hierarchies": [
{
"name": "Month and Year",
"description": "'Month and Year' has the format YYYYMM (example March of 2015 is 201503)",
"levels": [
"Year",
"Month and Year"
]
}
]

"name": "Time",
"description": "Periodicity of the data (monthly or annual).",
"hierarchies": [
Expand Down
162 changes: 146 additions & 16 deletions api/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,16 +1,146 @@
dynaconf
fastapi
jinja2
langchain==0.0.218
openai==0.27.4
pandas
psycopg2-binary
python-dotenv
python-multipart
sentence_transformers
sqlalchemy
sqlmodel
tabulate
tiktoken==0.3.2
typer
uvicorn
aiohttp==3.9.3
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.3.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==23.2.0
Automat==22.10.0
Babel==2.14.0
beautifulsoup4==4.12.3
bleach==6.1.0
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
comm==0.2.2
constantly==23.10.4
cryptography==42.0.5
cssselect==1.2.0
dataclasses-json==0.6.4
debugpy==1.8.1
decorator==5.1.1
defusedxml==0.7.1
distro==1.9.0
executing==2.0.1
fastjsonschema==2.19.1
filelock==3.13.1
fqdn==1.5.1
frozenlist==1.4.1
greenlet==3.0.3
h11==0.14.0
httpcore==1.0.4
httpx==0.27.0
hyperlink==21.0.0
idna==3.6
incremental==22.10.0
iniconfig==2.0.0
ipykernel==6.29.3
ipython==8.22.2
isoduration==20.11.0
itemadapter==0.8.0
itemloaders==1.1.0
jedi==0.19.1
Jinja2==3.1.3
jmespath==1.0.1
json5==0.9.22
jsonpatch==1.33
jsonpointer==2.4
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
jupyter-events==0.9.1
jupyter-lsp==2.2.4
jupyter_client==8.6.1
jupyter_core==5.7.2
jupyter_server==2.13.0
jupyter_server_terminals==0.5.3
jupyterlab==4.1.5
jupyterlab_pygments==0.3.0
jupyterlab_server==2.25.4
langchain==0.1.12
langchain-community==0.0.28
langchain-core==0.1.32
langchain-openai==0.0.8
langchain-text-splitters==0.0.1
langsmith==0.1.26
lxml==5.1.0
MarkupSafe==2.1.5
marshmallow==3.21.1
matplotlib-inline==0.1.6
mistune==3.0.2
multidict==6.0.5
mypy-extensions==1.0.0
nbclient==0.10.0
nbconvert==7.16.2
nbformat==5.10.3
nest-asyncio==1.6.0
notebook_shim==0.2.4
numpy==1.26.4
openai==1.14.0
orjson==3.9.15
overrides==7.7.0
packaging==23.2
pandocfilters==1.5.1
parsel==1.9.0
parso==0.8.3
pexpect==4.9.0
platformdirs==4.2.0
pluggy==1.4.0
prometheus_client==0.20.0
prompt-toolkit==3.0.43
Protego==0.3.0
psutil==5.9.8
ptyprocess==0.7.0
pure-eval==0.2.2
pyasn1==0.5.1
pyasn1-modules==0.3.0
pycparser==2.21
pydantic==2.6.4
pydantic_core==2.16.3
PyDispatcher==2.0.7
Pygments==2.17.2
pyOpenSSL==24.1.0
pytest==8.1.1
python-dateutil==2.9.0.post0
python-json-logger==2.0.7
PyYAML==6.0.1
pyzmq==25.1.2
queuelib==1.6.2
referencing==0.33.0
regex==2023.12.25
requests==2.31.0
requests-file==2.0.0
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rpds-py==0.18.0
Scrapy==2.11.1
Send2Trash==1.8.2
service-identity==24.1.0
six==1.16.0
sniffio==1.3.1
soupsieve==2.5
SQLAlchemy==2.0.28
stack-data==0.6.3
tenacity==8.2.3
terminado==0.18.1
tiktoken==0.6.0
tinycss2==1.2.1
tldextract==5.1.1
tornado==6.4
tqdm==4.66.2
traitlets==5.14.2
Twisted==24.3.0
types-python-dateutil==2.9.0.20240315
typing-inspect==0.9.0
typing_extensions==4.10.0
uri-template==1.3.0
urllib3==2.2.1
w3lib==2.1.2
wcwidth==0.2.13
webcolors==1.13
webencodings==0.5.1
websocket-client==1.7.0
yarl==1.9.4
zope.interface==6.2
4 changes: 4 additions & 0 deletions api/setup/schema_to_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
import xml.etree.ElementTree as ET

def parse_xml_to_json(xml_file):
"""
Parses XML schema to custom json format.
"""

tree = ET.parse(xml_file)
root = tree.getroot()

Expand Down
8 changes: 5 additions & 3 deletions api/src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,10 @@
TESSERACT_API = getenv("TESSERACT_API")

# Mondrian Connection
MONDRIAN_API = getenv("MONDRIAN_API")

MONDRIAN_API = getenv('MONDRIAN_API')

# Files Directories
FEW_SHOT_PATH = getenv("FEW_SHOT_PATH")
TABLES_PATH = getenv("TABLES_PATH")
TABLES_PATH = getenv('TABLES_PATH')
FEW_SHOT_PATH = getenv('FEW_SHOT_PATH')

51 changes: 43 additions & 8 deletions api/src/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from fastapi import FastAPI

from config import TABLES_PATH
from fastapi.responses import StreamingResponse
from utils.app import get_api
from wrapper.lanbot import Langbot
from langchain_core.runnables import RunnableLambda, chain
import time
import json
from config import TABLES_PATH

# fastapi instance declaration
app = FastAPI()
Expand All @@ -14,15 +18,46 @@ async def root():
"status": "ok"
}

@app.get("/wrap/{query}")
async def wrap(query):
    """Stream the Langbot output for ``query`` back to the client.

    The path parameter is passed to ``Langbot`` together with ``get_api``,
    an empty list and ``TABLES_PATH``. Whatever ``Langbot`` returns becomes
    the body of a ``StreamingResponse`` labelled ``application/json``.
    """
    # NOTE(review): the empty list is presumably the initial chat history;
    # confirm against Langbot's signature.
    stream = Langbot(query, get_api, [], TABLES_PATH)
    return StreamingResponse(stream, media_type="application/json")


@app.get("/query/{query}")
async def read_item(query: str):
    """Answer a natural-language question through ``get_api``.

    Args:
        query: The question, taken from the URL path.

    Returns:
        A dict with a single ``"query"`` key whose value holds the original
        ``"question"``, the text ``"answer"`` and the API ``"url"`` that
        ``get_api`` reported.
    """
    # get_api returns three values; the second one (the fetched data) is not
    # exposed by this endpoint.
    api_url, _data, text_response = get_api(query, TABLES_PATH)

    return {
        "query": {
            "question": query,
            "answer": text_response,
            "url": api_url,
        }
    }

#test
@chain
def just(input):
    """Yield the pieces of ``input['input']`` one at a time.

    The text is split on single space characters, so consecutive spaces
    produce empty strings in the output.
    """
    yield from input['input'].split(' ')

#@chain
def fn(input):
    """Yield the same JSON message twice, four seconds apart.

    Args:
        input: Any JSON-serialisable value; it is printed and then wrapped
            as ``{"msg": input}``.

    Yields:
        The JSON-encoded message, once immediately and once after the pause.
    """
    print(input)
    yield json.dumps({'msg': input})
    # Blocking pause between the two chunks so a streaming consumer can
    # observe them arriving separately.
    time.sleep(4)
    yield json.dumps({'msg': input})

def fn2():
    """Yield the chunks produced by ``fn`` for a fixed sample sentence.

    Waits two seconds before the first chunk is produced.
    """
    # The composed pipeline is built but never invoked; only ``fn`` is
    # iterated directly below.
    pipeline = just | fn
    time.sleep(2)
    payload = {'input': 'the jumping flying fox'}
    for chunk in fn(payload):
        yield chunk

@app.get("/num/")
def num():
    """Stream the output of ``fn2`` as an ``application/json`` response."""
    body = fn2()
    return StreamingResponse(body, media_type="application/json")


5 changes: 5 additions & 0 deletions api/src/test/wrapper_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from src.wrapper.lanbot import Langbot


def test_basic():
    """Smoke test: calling Langbot with a query and a callback gives a truthy result."""
    # NOTE(review): api/src/main.py calls Langbot with four positional
    # arguments (query, get_api, [], TABLES_PATH); confirm that two are
    # enough here.
    bot = Langbot('hi', lambda x: print(x))
    assert bot
6 changes: 4 additions & 2 deletions api/src/utils/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,7 @@ def get_api(query, TABLES_PATH):
log_apicall(query, api_url, response, variables, measures, cuts, table, duration)
return api_url, data, response

TABLES_PATH = getenv('TABLES_PATH')
get_api('How much did the CPI of fresh fruits change between 2019 and 2021', TABLES_PATH)
if __name__ == "__main__":
TABLES_PATH = getenv('TABLES_PATH')
get_api('How much did the CPI of fresh fruits change between 2019 and 2021', TABLES_PATH)

Empty file.
Empty file.
6 changes: 4 additions & 2 deletions api/src/utils/data_analysis/data_analysis.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from config import OPENAI_KEY
from langchain.agents import create_pandas_dataframe_agent
from langchain.chat_models import ChatOpenAI
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain_community.chat_models import ChatOpenAI


def agent_answer(df, natural_language_query):

Expand All @@ -21,6 +22,7 @@ def agent_answer(df, natural_language_query):
)

llm = ChatOpenAI(model_name='gpt-4-1106-preview', temperature=0, openai_api_key=OPENAI_KEY)

agent = create_pandas_dataframe_agent(llm, df, verbose=True)
response = agent.run(prompt)

Expand Down
40 changes: 40 additions & 0 deletions api/src/utils/helpers/old/cube_to_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pandas as pd

from config import POSTGRES_ENGINE
from sentence_transformers import SentenceTransformer

def embedding(dataframe, column):
    """
    Adds an 'embedding' column holding the encoded text of the passed column.

    The dataframe is modified in place and is also returned.
    """
    encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

    texts = dataframe[column].to_list()
    # Store plain Python lists (one per row) rather than the encoder's array.
    dataframe['embedding'] = encoder.encode(texts).tolist()

    return dataframe


def create_table():
    """
    Creates the datausa_tables.cubes table if it does not exist yet.

    The table has the columns table_name (text), table_description (text)
    and embedding (vector(384)).
    """
    ddl = (
        "CREATE TABLE IF NOT EXISTS datausa_tables.cubes "
        "(table_name text, table_description text, embedding vector(384))"
    )
    # Engine.execute() was removed in SQLAlchemy 2.0, so run the statement on
    # an explicit connection. begin() commits on success and rolls back on
    # error.
    # NOTE(review): assumes POSTGRES_ENGINE is a SQLAlchemy Engine; confirm in
    # config.
    with POSTGRES_ENGINE.begin() as connection:
        connection.exec_driver_sql(ddl)


def load_data_to_db(df):
    """
    Embeds the 'table_description' column of df and appends the rows to
    the datausa_tables.cubes table.
    """
    print(df.head())

    with_vectors = embedding(df, 'table_description')
    with_vectors.to_sql(
        'cubes',
        con=POSTGRES_ENGINE,
        schema='datausa_tables',
        if_exists='append',
        index=False,
    )


# Single seed row describing the cube registered by this script.
df = pd.DataFrame({
    "table_name": ["Data_USA_House_election"],
    "table_description": [
        "Table 'Data_USA_House_election' contains House election data, including number of votes by candidate, party and state."
    ],
})

# These calls run whenever the module is executed or imported.
create_table()
load_data_to_db(df)
Loading

0 comments on commit 8aa2373

Please sign in to comment.