-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinitialize.py
58 lines (42 loc) · 1.18 KB
/
initialize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
print("Starting initialize.py")
# %% [markdown]
# # Logging
# %%
import logging

# Every record is rendered as "<level> - <logger name> - <message>".
LOG_FORMAT = "%(levelname)s - %(name)s - %(message)s"

# Default threshold for the root logger: warnings and above.
logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT)

# The "haystack" logger gets a lower threshold so its INFO messages are shown too.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)
# %% [markdown]
# # Configurations
# %%
import os

# Process-wide switches, set through the environment in a single update.
# NOTE(review): presumably these opt out of Haystack telemetry and turn off
# tokenizer parallelism - confirm against the respective library docs.
os.environ.update(
    {
        "HAYSTACK_TELEMETRY_ENABLED": "False",
        "TOKENIZERS_PARALLELISM": "false",
    }
)
# %% [markdown]
# # Document Store
# %%
import os
from haystack.document_stores import ElasticsearchDocumentStore

# Get the host where Elasticsearch is running. The ELASTICSEARCH_HOST
# environment variable overrides it; when unset, fall back to "elasticsearch",
# the hostname this script previously hard-coded.
host = os.environ.get("ELASTICSEARCH_HOST", "elasticsearch")

# Empty username/password are passed through as-is; documents are stored in
# the "document" index.
document_store = ElasticsearchDocumentStore(host=host, username="", password="", index="document")
# %% [markdown]
# # Preprocessing of documents
# %%
import pandas as pd

# Load the segments CSV into a DataFrame.
file_path = "/opt/data/segments.csv"
df = pd.read_csv(file_path)

# cleanup
# Replace missing cells with empty strings so every value in "text" is a str
# before it is stripped.
df = df.fillna(value="")
# Trim leading/trailing whitespace from each text value.
df["text"] = df["text"].map(str.strip)
# Expose the text under the "content" column.
# NOTE(review): presumably "content" is the field name the document store
# expects for the document body - confirm against the Haystack docs.
df = df.rename(columns={"text": "content"})
# print(df.head())
# Call count() so the per-column value counts are printed, rather than the
# repr of the bound method.
print(df.count())
# %%
from pprint import pprint

# Turn the DataFrame into a list with one dict per row, keyed by column name.
docs = df.to_dict("records")
# pprint(docs[:3])
# %% [markdown]
# # Write documents
# %%
# docs = docs[:100] # max for testing
# Rebuild the store contents: delete first, then write the records prepared
# above. The order matters - deleting after writing would discard the new data.
# NOTE(review): delete_documents() is called without arguments; presumably it
# removes every document in the configured index - confirm before running
# against a store that holds data from other sources.
document_store.delete_documents()
document_store.write_documents(docs)