diff --git a/indico_toolkit/results/document.py b/indico_toolkit/results/document.py index 19da381..264070a 100644 --- a/indico_toolkit/results/document.py +++ b/indico_toolkit/results/document.py @@ -8,7 +8,6 @@ class Document: id: int name: str etl_output_url: str - full_text_url: str # Auto review changes must reproduce all model sections that were present in the # original result file. This may not be possible from the predictions alone--if a @@ -26,14 +25,12 @@ def from_v1_dict(result: object) -> "Document": document_results = get(result, dict, "results", "document", "results") model_names = frozenset(document_results.keys()) etl_output_url = get(result, str, "etl_output") - full_text_url = etl_output_url.replace("etl_output.json", "full_text.txt") return Document( # v1 result files don't include document IDs or filenames. id=None, # type: ignore[arg-type] name=None, # type: ignore[arg-type] etl_output_url=etl_output_url, - full_text_url=full_text_url, _model_sections=model_names, ) @@ -45,12 +42,10 @@ def from_v3_dict(document: object) -> "Document": model_results = get(document, dict, "model_results", "ORIGINAL") model_ids = frozenset(model_results.keys()) etl_output_url = get(document, str, "etl_output") - full_text_url = etl_output_url.replace("etl_output.json", "full_text.txt") return Document( id=get(document, int, "submissionfile_id"), name=get(document, str, "input_filename"), etl_output_url=etl_output_url, - full_text_url=full_text_url, _model_sections=model_ids, ) diff --git a/indico_toolkit/results/normalization.py b/indico_toolkit/results/normalization.py index 00dc6e2..6423add 100644 --- a/indico_toolkit/results/normalization.py +++ b/indico_toolkit/results/normalization.py @@ -37,6 +37,7 @@ def normalize_v1_result(result: "Any") -> None: for prediction in review if prediction is not None ) + for prediction in predictions: # Predictions added in review lack a `confidence` section. if "confidence" not in prediction: @@ -90,52 +91,53 @@ def normalize_v3_result(result: "Any") -> None: """ Fix inconsistencies observed in v3 result files. """ - for document in get(result, list, "submission_results"): - for review_results in get(document, dict, "model_results").values(): - predictions: "Any" = ( - prediction - for model_results in review_results.values() - for prediction in model_results - ) - for prediction in predictions: - # Predictions added in review lack a `confidence` section. - if "confidence" not in prediction: - prediction["confidence"] = {prediction["label"]: 0} - - # Document Extractions added in review may lack spans. - if ( - "text" in prediction - and "type" not in prediction - and "spans" not in prediction - ): - prediction["spans"] = [ - { - "page_num": prediction["page_num"], - "start": 0, - "end": 0, - } - ] - - # Form Extractions added in review may lack bounding boxes. - if "type" in prediction and "top" not in prediction: - prediction["top"] = 0 - prediction["left"] = 0 - prediction["right"] = 0 - prediction["bottom"] = 0 - - # Prior to 6.11, some Extractions lack a `normalized` section after - # review. - if "text" in prediction and "normalized" not in prediction: - prediction["normalized"] = {"formatted": prediction["text"]} - - # Document Extractions that didn't go through a linked labels - # transformer lack a `groupings` section. - if ( - "text" in prediction - and "type" not in prediction - and "groupings" not in prediction - ): - prediction["groupings"] = [] + predictions: "Any" = ( + prediction + for submission_result in get(result, list, "submission_results") + for model_result in get(submission_result, dict, "model_results").values() + for review_result in model_result.values() + for prediction in review_result + ) + + for prediction in predictions: + # Predictions added in review lack a `confidence` section. + if "confidence" not in prediction: + prediction["confidence"] = {prediction["label"]: 0} + + # Document Extractions added in review may lack spans. + if ( + "text" in prediction + and "type" not in prediction + and "spans" not in prediction + ): + prediction["spans"] = [ + { + "page_num": prediction["page_num"], + "start": 0, + "end": 0, + } + ] + + # Form Extractions added in review may lack bounding boxes. + if "type" in prediction and "top" not in prediction: + prediction["top"] = 0 + prediction["left"] = 0 + prediction["right"] = 0 + prediction["bottom"] = 0 + + # Prior to 6.11, some Extractions lack a `normalized` section after + # review. + if "text" in prediction and "normalized" not in prediction: + prediction["normalized"] = {"formatted": prediction["text"]} + + # Document Extractions that didn't go through a linked labels + # transformer lack a `groupings` section. + if ( + "text" in prediction + and "type" not in prediction + and "groupings" not in prediction + ): + prediction["groupings"] = [] # Prior to 6.8, v3 result files don't include a `reviews` section. if not has(result, dict, "reviews"): diff --git a/indico_toolkit/results/result.py b/indico_toolkit/results/result.py index 5d159bb..0d6384b 100644 --- a/indico_toolkit/results/result.py +++ b/indico_toolkit/results/result.py @@ -19,10 +19,10 @@ class Result: version: int submission_id: int - documents: "list[Document]" - models: "list[ModelGroup]" + documents: "tuple[Document, ...]" + models: "tuple[ModelGroup, ...]" predictions: "PredictionList[Prediction]" - reviews: "list[Review]" + reviews: "tuple[Review, ...]" @property def rejected(self) -> bool: @@ -88,10 +88,10 @@ def from_v1_dict(result: object) -> "Result": return Result( version=version, submission_id=submission_id, - documents=[document], - models=models, + documents=(document,), + models=tuple(models), predictions=predictions, - reviews=sorted(reviews), + reviews=tuple(sorted(reviews)), ) @staticmethod @@ -141,8 +141,8 @@ def from_v3_dict(result: object) -> "Result": return Result( version=version, submission_id=submission_id, - documents=documents, - models=models, + documents=tuple(documents), + models=tuple(models), predictions=predictions, - reviews=reviews, + reviews=tuple(reviews), ) diff --git a/tests/results/test_predictionlist.py b/tests/results/test_predictionlist.py index 8843094..7dd37e2 100644 --- a/tests/results/test_predictionlist.py +++ b/tests/results/test_predictionlist.py @@ -21,7 +21,6 @@ def document() -> Document: id=2922, name="1040_filled.tiff", etl_output_url="indico-file:///storage/submission/2922/etl_output.json", - full_text_url="indico-file:///storage/submission/2922/full_text.txt", _model_sections=frozenset({"124", "123", "122", "121"}), )