-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathes_loader.py
executable file
·306 lines (273 loc) · 12.6 KB
/
es_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#!/usr/bin/env python3
import argparse
import os
import yaml
import re
from elasticsearch import Elasticsearch, RequestsHttpConnection
from elasticsearch.helpers import streaming_bulk
from requests_aws4auth import AWS4Auth
from botocore.session import Session
from neo4j import GraphDatabase
from bento.common.utils import get_logger, print_config
from icdc_schema import ICDC_Schema, PROPERTIES, ENUM, PROP_ENUM, PROP_TYPE, REQUIRED, DESCRIPTION
from props import Props
logger = get_logger('ESLoader')
OPENSEARCH_DATA = 'opensearch_data'
class ESLoader:
def __init__(self, es_host, neo4j_driver):
self.neo4j_driver = neo4j_driver
timeout_seconds = 60
if 'amazonaws.com' in es_host:
awsauth = AWS4Auth(
refreshable_credentials=Session().get_credentials(),
region='us-east-1',
service='es'
)
self.es_client = Elasticsearch(
hosts=[es_host],
http_auth=awsauth,
use_ssl=True,
verify_certs=True,
connection_class=RequestsHttpConnection,
timeout=timeout_seconds
)
else:
self.es_client = Elasticsearch(hosts=[es_host], timeout=timeout_seconds)
def create_index(self, index_name, mapping):
"""Creates an index in Elasticsearch if one isn't already there."""
return self.es_client.indices.create(
index=index_name,
body={
"settings": {
"number_of_shards": 1,
"index.mapping.nested_objects.limit": 100000
},
"mappings": {
"properties": mapping
},
},
ignore=400,
)
def delete_index(self, index_name):
return self.es_client.indices.delete(index=index_name, ignore_unavailable=True)
def get_data(self, cypher_query: str, fields: dict, skip: int = 0, limit: int = 10000000):
"""Reads data from Neo4j, for each row
yields a single document. This function is passed into the bulk()
helper to create many documents in sequence.
"""
with self.neo4j_driver.session() as session:
result = session.run(cypher_query, {"skip": skip, "limit": limit})
for record in result:
keys = record.keys()
if len(keys) == 1 and keys[0].lower() == OPENSEARCH_DATA.lower():
record = record[record.keys()[0]]
doc = {}
for key in fields:
doc[key] = record[key]
yield doc
def recreate_index(self, index_name, mapping):
logger.info(f'Deleting old index: "{index_name}"')
result = self.delete_index(index_name)
logger.info(result)
logger.info(f'Creating index: "{index_name}"')
result = self.create_index(index_name, mapping)
logger.info(result)
def load(self, index_name, mapping, cypher_queries):
self.recreate_index(index_name, mapping)
logger.info('Indexing data from Neo4j')
total_successes = 0
total_documents = 0
for i, cypher_query in enumerate(cypher_queries):
query = cypher_query.get('query')
if query is None:
raise Exception(f'A query entry is missing for {index_name}')
page_size = cypher_query.get('page_size')
if page_size is None:
page_size = 0
logger.info(f'Executing index query {i+1}/{len(cypher_queries)}')
if page_size > 0:
logger.info(f'Page size is set to {page_size}')
skip = 0
total = page_size
while total == page_size:
successes, total = self.bulk_load(
index_name,
self.get_data(
query, mapping.keys(), skip=skip, limit=page_size
)
)
total_successes += successes
total_documents += total
logger.info(f"Indexing in progress: successfully indexed {total_successes}/{total_documents} documents")
skip += page_size
else:
logger.info(f'Pagination is disabled')
successes, documents = self.bulk_load(index_name, self.get_data(query, mapping.keys()))
total_successes += successes
total_documents += documents
logger.info(f"Indexing completed: successfully indexed {total_successes}/{total_documents} documents")
return total_successes
def bulk_load(self, index_name, data):
successes = 0
total = 0
for ok, _ in streaming_bulk(
client=self.es_client,
index=index_name,
actions=data,
max_retries=2,
initial_backoff=10,
max_backoff=20,
max_chunk_bytes=10485760
):
total += 1
successes += 1 if ok else 0
return successes, total
def load_about_page(self, index_name, mapping, file_name):
logger.info('Indexing content from about page')
if not os.path.isfile(file_name):
raise Exception(f'"{file_name} is not a file!')
self.recreate_index(index_name, mapping)
with open(file_name) as file_obj:
about_file = yaml.safe_load(file_obj)
for page in about_file:
logger.info(f'Indexing about page "{page["page"]}"')
self.index_data(index_name, page, f'page{page["page"]}')
def read_model(self, model_files, prop_file):
for file_name in model_files:
if not os.path.isfile(file_name):
raise Exception(f'"{file_name} is not a file!')
if not os.path.isfile(prop_file):
raise Exception(f'"{prop_file} is not a file!')
self.model = ICDC_Schema(model_files, Props(prop_file))
def load_model(self, index_name, mapping, subtype):
logger.info(f'Indexing data model')
if not self.model:
logger.warning(f'Data model is not loaded, {index_name} will not be loaded!')
return
self.recreate_index(index_name, mapping)
self.bulk_load(index_name, self.get_model_data(subtype))
def get_model_data(self, subtype):
nodes = self.model.nodes
for node_name, obj in nodes.items():
props = obj[PROPERTIES]
if subtype == 'node':
yield {
'type': 'node',
'node': node_name,
'node_name': node_name,
'node_kw': node_name
}
else:
for prop_name, prop in props.items():
# Skip relationship based properties
if "@relation" in obj[PROPERTIES][prop_name][PROP_TYPE]:
continue
if subtype == 'property':
yield {
'type': 'property',
'node': node_name,
'node_name': node_name,
'property': prop_name,
'property_name': prop_name,
'property_kw': prop_name,
'property_description': prop.get(DESCRIPTION, ''),
'property_required': prop.get(REQUIRED, False),
'property_type': PROP_ENUM if ENUM in prop else prop[PROP_TYPE]
}
elif subtype == 'value' and ENUM in prop:
for value in prop[ENUM]:
yield {
'type': 'value',
"node": node_name,
"node_name": node_name,
"property": prop_name,
"property_name": prop_name,
'property_description': prop.get(DESCRIPTION, ''),
'property_required': prop.get(REQUIRED, False),
'property_type': PROP_ENUM,
"value": value,
"value_kw": value
}
def index_data(self, index_name, object, id):
self.es_client.index(index_name, body=object, id=id)
def main():
parser = argparse.ArgumentParser(description='Load data from Neo4j to Elasticsearch')
parser.add_argument('indices_file',
type=argparse.FileType('r'),
help='Configuration file for indices, example is in config/es_indices.example.yml')
parser.add_argument('config_file',
type=argparse.FileType('r'),
help='Configuration file, example is in config/es_loader.example.yml')
args = parser.parse_args()
config = yaml.safe_load(args.config_file)['Config']
indices = yaml.safe_load(args.indices_file)['Indices']
print_config(logger, config)
neo4j_driver = GraphDatabase.driver(
config['neo4j_uri'],
auth=(config['neo4j_user'], config['neo4j_password']),
encrypted=False
)
loader = ESLoader(
es_host=config['es_host'],
neo4j_driver=neo4j_driver
)
load_model = False
if 'model_files' in config and config['model_files'] and 'prop_file' in config and config['prop_file']:
loader.read_model(config['model_files'], config['prop_file'])
load_model = True
summary = {}
for index in indices:
index_name = index.get('index_name')
summary[index_name] = "ERROR!"
logger.info(f'Begin loading index: "{index_name}"')
if 'type' not in index or index['type'] == 'neo4j':
cypher_queries = index.get('cypher_queries')
cypher_query = index.get('cypher_query')
if cypher_queries is None and cypher_query is not None:
cypher_queries = [{'query': cypher_query}]
try:
_validate_cypher_queries(cypher_queries)
summary[index_name] = loader.load(index_name, index['mapping'], cypher_queries)
except Exception as ex:
logger.error(f'There is an error in the "{index_name}" index definition, this index will not be loaded')
logger.error(ex)
elif index['type'] == 'about_file':
if 'about_file' in config:
loader.load_about_page(index_name, index['mapping'], config['about_file'])
summary[index_name] = "Loaded Successfully"
else:
logger.warning(f'"about_file" not set in configuration file, {index_name} will not be loaded!')
elif index['type'] == 'model':
if load_model and 'subtype' in index:
loader.load_model(index_name, index['mapping'], index['subtype'])
summary[index_name] = "Loaded Successfully"
else:
logger.warning(
f'"model_files" not set in configuration file, {index_name} will not be loaded!')
else:
logger.error(f'Unknown index type: "{index["type"]}"')
logger.info(f'Index loading summary:')
for index in summary.keys():
logger.info(f'{index}: {summary[index]}')
def _validate_cypher_queries(cypher_queries):
if type(cypher_queries) is not list:
raise Exception(f'The required property "cypher_queries" must be a list')
for i, cypher_query in enumerate(cypher_queries):
if type(cypher_query) is not dict:
raise Exception(f'Each entry in the "cypher_queries" list be a dict with a "query" property')
query = cypher_query.get('query')
if query is None:
raise Exception(f'The required property "query" is missing from a "cypher_queries" entry')
page_size = cypher_query.get('page_size')
if not _check_query_for_pagination(query):
logger.warning(f'Pagination parameters are missing from "cypher_queries" entry {i+1}, pagination will be disabled for this query')
cypher_query['page_size'] = 0
elif page_size is None:
logger.warning(
f'The page_size property is missing from "cypher_queries" entry {i+1}, pagination will be disabled for this query')
cypher_query['page_size'] = 0
def _check_query_for_pagination(query: str):
match = re.search('skip\s*\$skip\s*limit\s*\$limit', query, re.IGNORECASE)
return match is not None
if __name__ == '__main__':
main()