Skip to content

Commit 09141d9

Browse files
Notification for embedder warnings (#5)
* adds notification for embedder warnings
* adds embedders directory to gitignore
* prints last embedder warning in notification
* sends embedding warning as notification of type warning
* adds primary key or record_id to embedding warning
* fixes spelling
* updates embedders version number

Co-authored-by: JWittmeyer <[email protected]>
1 parent 5eb70a0 commit 09141d9

File tree

5 files changed

+100
-28
lines changed

5 files changed

+100
-28
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,4 +129,7 @@ dmypy.json
129129

130130
# Pyre type checker
131131
.pyre/
132-
.DS_Store
132+
.DS_Store
133+
134+
# embedders package
135+
embedders/

controller.py

Lines changed: 79 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
record,
99
tokenization,
1010
notification,
11+
organization,
1112
)
1213
import torch
1314
import traceback
@@ -24,10 +25,9 @@
2425
from util import daemon, request_util
2526
from util.decorator import param_throttle
2627
from util.embedders import get_embedder
27-
from util.notification import send_project_update
28+
from util.notification import send_project_update, embedding_warning_templates
2829
import os
2930
import pandas as pd
30-
from submodules.model.business_objects import embedding, general, organization
3131
from submodules.s3 import controller as s3
3232

3333
logging.basicConfig(level=logging.INFO)
@@ -133,7 +133,7 @@ def prepare_run_encoding(request: data_type.Request, embedding_type: str) -> int
133133
request.project_id,
134134
request.user_id,
135135
message,
136-
"ERROR",
136+
enums.Notification.ERROR.value,
137137
enums.NotificationType.EMBEDDING_CREATION_FAILED.value,
138138
True,
139139
)
@@ -142,11 +142,6 @@ def prepare_run_encoding(request: data_type.Request, embedding_type: str) -> int
142142
f"notification_created:{request.user_id}",
143143
True,
144144
)
145-
embedding.update_embedding_state_failed(
146-
request.project_id,
147-
embedding_id,
148-
with_commit=True,
149-
)
150145
doc_ock.post_embedding_failed(
151146
request.user_id, request.config_string
152147
)
@@ -175,7 +170,7 @@ def run_encoding(
175170
request.project_id,
176171
request.user_id,
177172
f"Initializing model {request.config_string}. This can take a few minutes.",
178-
"INFO",
173+
enums.Notification.INFO.value,
179174
enums.NotificationType.EMBEDDING_CREATION_STARTED.value,
180175
True,
181176
)
@@ -184,7 +179,9 @@ def run_encoding(
184179
)
185180
iso2_code = project.get_blank_tokenizer_from_project(request.project_id)
186181
try:
187-
embedder = get_embedder(request.project_id, embedding_type, request.config_string, iso2_code)
182+
embedder = get_embedder(
183+
request.project_id, embedding_type, request.config_string, iso2_code
184+
)
188185
except OSError:
189186
embedding.update_embedding_state_failed(
190187
request.project_id,
@@ -201,7 +198,7 @@ def run_encoding(
201198
request.project_id,
202199
request.user_id,
203200
message,
204-
"ERROR",
201+
enums.Notification.ERROR.value,
205202
enums.NotificationType.EMBEDDING_CREATION_FAILED.value,
206203
True,
207204
)
@@ -226,7 +223,7 @@ def run_encoding(
226223
request.project_id,
227224
request.user_id,
228225
message,
229-
"ERROR",
226+
enums.Notification.ERROR.value,
230227
enums.NotificationType.EMBEDDING_CREATION_FAILED.value,
231228
True,
232229
)
@@ -240,7 +237,7 @@ def run_encoding(
240237
request.project_id,
241238
request.user_id,
242239
f"Could not load model {request.config_string}. Please contact the support.",
243-
"ERROR",
240+
enums.Notification.ERROR.value,
244241
enums.NotificationType.EMBEDDING_CREATION_FAILED.value,
245242
True,
246243
)
@@ -283,7 +280,7 @@ def run_encoding(
283280
request.project_id,
284281
request.user_id,
285282
f"Started encoding {attribute_name} using model {request.config_string}.",
286-
"INFO",
283+
enums.Notification.INFO.value,
287284
enums.NotificationType.EMBEDDING_CREATION_STARTED.value,
288285
True,
289286
)
@@ -324,6 +321,39 @@ def run_encoding(
324321
initial_count,
325322
)
326323
except Exception:
324+
for warning_type, idx_list in embedder.get_warnings().items():
325+
# use last record with warning as example
326+
example_record_id = record_ids[idx_list[-1]]
327+
328+
primary_keys = [
329+
pk.name for pk in attribute.get_primary_keys(request.project_id)
330+
]
331+
if primary_keys:
332+
example_record_data = record.get(
333+
request.project_id, example_record_id
334+
).data
335+
example_record_msg = "with primary key: " + ", ".join(
336+
[str(example_record_data[p_key]) for p_key in primary_keys]
337+
)
338+
else:
339+
example_record_msg = " with record id: " + str(example_record_id)
340+
341+
warning_msg = embedding_warning_templates[warning_type].format(
342+
record_number=len(idx_list), example_record_msg=example_record_msg
343+
)
344+
345+
notification.create(
346+
request.project_id,
347+
request.user_id,
348+
warning_msg,
349+
enums.Notification.WARNING.value,
350+
enums.NotificationType.EMBEDDING_CREATION_WARNING.value,
351+
True,
352+
)
353+
send_project_update(
354+
request.project_id, f"notification_created:{request.user_id}", True
355+
)
356+
327357
embedding.update_embedding_state_failed(
328358
request.project_id,
329359
embedding_id,
@@ -337,27 +367,51 @@ def run_encoding(
337367
request.project_id,
338368
request.user_id,
339369
"Error at runtime. Please contact support.",
340-
"ERROR",
370+
enums.Notification.ERROR.value,
341371
enums.NotificationType.EMBEDDING_CREATION_FAILED.value,
342372
True,
343373
)
344374
send_project_update(
345375
request.project_id, f"notification_created:{request.user_id}", True
346376
)
347377
print(traceback.format_exc(), flush=True)
348-
embedding.update_embedding_state_failed(
349-
request.project_id,
350-
embedding_id,
351-
with_commit=True,
352-
)
353-
send_project_update(
354-
request.project_id,
355-
f"embedding:{embedding_id}:state:{enums.EmbeddingState.FAILED.value}",
356-
)
357378
doc_ock.post_embedding_failed(request.user_id, request.config_string)
358379
return 500
359380

360381
if embedding.get(request.project_id, embedding_id):
382+
for warning_type, idx_list in embedder.get_warnings().items():
383+
# use last record with warning as example
384+
example_record_id = record_ids[idx_list[-1]]
385+
386+
primary_keys = [
387+
pk.name for pk in attribute.get_primary_keys(request.project_id)
388+
]
389+
if primary_keys:
390+
example_record_data = record.get(
391+
request.project_id, example_record_id
392+
).data
393+
example_record_msg = "with primary key: " + ", ".join(
394+
[str(example_record_data[p_key]) for p_key in primary_keys]
395+
)
396+
else:
397+
example_record_msg = " with record id: " + str(example_record_id)
398+
399+
warning_msg = embedding_warning_templates[warning_type].format(
400+
record_number=len(idx_list), example_record_msg=example_record_msg
401+
)
402+
403+
notification.create(
404+
request.project_id,
405+
request.user_id,
406+
warning_msg,
407+
enums.Notification.WARNING.value,
408+
enums.NotificationType.EMBEDDING_CREATION_WARNING.value,
409+
True,
410+
)
411+
send_project_update(
412+
request.project_id, f"notification_created:{request.user_id}", True
413+
)
414+
361415
if embedding_type == "classification":
362416
request_util.post_embedding_to_neural_search(
363417
request.project_id, embedding_id
@@ -376,7 +430,7 @@ def run_encoding(
376430
request.project_id,
377431
request.user_id,
378432
f"Finished encoding {attribute_name} using model {request.config_string}.",
379-
"SUCCESS",
433+
enums.Notification.SUCCESS.value,
380434
enums.NotificationType.EMBEDDING_CREATION_DONE.value,
381435
True,
382436
)

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ certifi==2021.10.8
77
charset-normalizer==2.0.12
88
click==8.0.4
99
cymem==2.0.6
10-
embedders==0.0.14
10+
embedders==0.0.15
1111
fastapi==0.78.0
1212
greenlet==1.1.2
1313
h11==0.13.0

util/notification.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,23 @@
11
import requests
22
import os
33

4+
from embedders.enums import WarningType
5+
46
from submodules.model.business_objects import project
57

8+
embedding_warning_templates = {
9+
WarningType.DOCUMENT_IS_SPLITTED.value: (
10+
"For {record_number} records, the text length exceeds the model's max input"
11+
" length. For these records, the texts are splitted and the parts are processed"
12+
" individually. For example, record {example_record_msg}."
13+
),
14+
WarningType.TOKEN_MISMATCHING.value: (
15+
"For {record_number} records, the number of embeddings does not match the "
16+
"number of spacy tokens. Please contact support. For example, record "
17+
"{example_record_msg}."
18+
),
19+
}
20+
621

722
def send_project_update(project_id: str, message: str, is_global: bool = False) -> None:
823
endpoint = os.getenv("WS_NOTIFY_ENDPOINT")

0 commit comments

Comments
 (0)