From 263174aa07de8441ee4f3390363168984555d11f Mon Sep 17 00:00:00 2001 From: Attila Papai <97034214+attila-papai@users.noreply.github.com> Date: Wed, 16 Aug 2023 21:18:38 +0200 Subject: [PATCH 01/20] [AL-6743] skip invalid media type validation when ADV is enabled (#1210) --- tests/integration/test_data_rows.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_data_rows.py b/tests/integration/test_data_rows.py index 870b0cf41..8248304b3 100644 --- a/tests/integration/test_data_rows.py +++ b/tests/integration/test_data_rows.py @@ -980,7 +980,7 @@ def test_create_conversational_text(dataset, conversational_content): data_row.row_data).json() == conversational_content['row_data'] -def test_invalid_media_type(dataset, conversational_content): +def test_invalid_media_type(dataset, conversational_content, is_adv_enabled): for error_message, invalid_media_type in [[ "Found invalid contents for media type: 'IMAGE'", 'IMAGE' ], ["Found invalid media type: 'totallyinvalid'", 'totallyinvalid']]: @@ -988,9 +988,13 @@ def test_invalid_media_type(dataset, conversational_content): # using malformed query. But for invalid contents in FileUploads we use InvalidQueryError with pytest.raises(labelbox.exceptions.InvalidQueryError): dataset.create_data_rows_sync([{ - **conversational_content, 'media_type': invalid_media_type + **conversational_content, 'media_type': 'IMAGE' }]) + if is_adv_enabled: + # ADV does not take media type hint into account for async import requests + continue + task = dataset.create_data_rows([{ **conversational_content, 'media_type': invalid_media_type }]) From 454950a51ffebd9a2595fe4d5b4992e3cd9a4ce5 Mon Sep 17 00:00:00 2001 From: Richard Sun Date: Thu, 17 Aug 2023 11:18:09 -0700 Subject: [PATCH 02/20] [QQC-2355] Provide methods to create multiple batches (#1197) --- labelbox/client.py | 24 ++++ labelbox/schema/create_batches_task.py | 67 +++++++++ labelbox/schema/project.py | 190 +++++++++++++++++++++---- tests/integration/conftest.py | 45 +++++- tests/integration/test_batch.py | 55 ++----- tests/integration/test_batches.py | 36 +++++ 6 files changed, 344 insertions(+), 73 deletions(-) create mode 100644 labelbox/schema/create_batches_task.py create mode 100644 tests/integration/test_batches.py diff --git a/labelbox/client.py b/labelbox/client.py index ce1ebe33c..c28f3fb78 100644 --- a/labelbox/client.py +++ b/labelbox/client.py @@ -1704,3 +1704,27 @@ def unarchive_feature_schema_node(self, ontology_id: str, raise labelbox.exceptions.LabelboxError( "Failed unarchive the feature schema node, message: ", response.text) + + def get_batch(self, project_id: str, batch_id: str) -> Entity.Batch: + # obtain batch entity to return + get_batch_str = """query %s($projectId: ID!, $batchId: ID!) 
{ + project(where: {id: $projectId}) { + batches(where: {id: $batchId}) { + nodes { + %s + } + } + } + } + """ % ("getProjectBatchPyApi", + query.results_query_part(Entity.Batch)) + + batch = self.execute( + get_batch_str, { + "projectId": project_id, + "batchId": batch_id + }, + timeout=180.0, + experimental=True)["project"]["batches"]["nodes"][0] + + return Entity.Batch(self, project_id, batch) diff --git a/labelbox/schema/create_batches_task.py b/labelbox/schema/create_batches_task.py new file mode 100644 index 000000000..919d30204 --- /dev/null +++ b/labelbox/schema/create_batches_task.py @@ -0,0 +1,67 @@ +import json +from typing import TYPE_CHECKING, Callable, List, Optional, Dict, Any + +from labelbox.orm.model import Entity + +if TYPE_CHECKING: + from labelbox import User + + def lru_cache() -> Callable[..., Callable[..., Dict[str, Any]]]: + pass +else: + from functools import lru_cache + + +class CreateBatchesTask: + + def __init__(self, client, project_id: str, batch_ids: List[str], + task_ids: List[str]): + self.client = client + self.project_id = project_id + self.batches = batch_ids + self.tasks = [ + Entity.Task.get_task(self.client, task_id) for task_id in task_ids + ] + + def wait_till_done(self, timeout_seconds: int = 300) -> None: + """ + Waits for the task to complete. + + Args: + timeout_seconds: the number of seconds to wait before timing out + + Returns: None + """ + + for task in self.tasks: + task.wait_till_done(timeout_seconds) + + def errors(self) -> Optional[Dict[str, Any]]: + """ + Returns the errors from the task, if any. + + Returns: a dictionary of errors, keyed by task id + """ + + errors = {} + for task in self.tasks: + if task.status == "FAILED": + errors[task.uid] = json.loads(task.result_url) + + if len(errors) == 0: + return None + + return errors + + @lru_cache() + def result(self): + """ + Returns the batches created by the task. + + Returns: the list of batches created by the task + """ + + return [ + self.client.get_batch(self.project_id, batch_id) + for batch_id in self.batches + ] diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index b5f4fdb8d..addb8c10b 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -7,9 +7,9 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union from urllib.parse import urlparse -from labelbox import parser import requests +from labelbox import parser from labelbox import utils from labelbox.exceptions import (InvalidQueryError, LabelboxError, ProcessingWaitTimeout, ResourceConflict, @@ -19,6 +19,7 @@ from labelbox.orm.model import Entity, Field, Relationship from labelbox.pagination import PaginatedCollection from labelbox.schema.consensus_settings import ConsensusSettings +from labelbox.schema.create_batches_task import CreateBatchesTask from labelbox.schema.data_row import DataRow from labelbox.schema.export_filters import ProjectExportFilters, validate_datetime, build_filters from labelbox.schema.export_params import ProjectExportParams @@ -26,8 +27,8 @@ from labelbox.schema.queue_mode import QueueMode from labelbox.schema.resource_tag import ResourceTag from labelbox.schema.task import Task -from labelbox.schema.user import User from labelbox.schema.task_queue import TaskQueue +from labelbox.schema.user import User if TYPE_CHECKING: from labelbox import BulkImportRequest @@ -721,14 +722,19 @@ def create_batch( consensus_settings: Optional[Dict[str, float]] = None, global_keys: Optional[List[str]] = None, ): - """Create a new batch for a project. 
One of `global_keys` or `data_rows` must be provided but not both. + """ + Creates a new batch for a project. One of `global_keys` or `data_rows` must be provided, but not both. A + maximum of 100,000 data rows can be added to a batch. Args: name: a name for the batch, must be unique within a project data_rows: Either a list of `DataRows` or Data Row ids. global_keys: global keys for data rows to add to the batch. priority: An optional priority for the Data Rows in the Batch. 1 highest -> 5 lowest - consensus_settings: An optional dictionary with consensus settings: {'number_of_labels': 3, 'coverage_percentage': 0.1} + consensus_settings: An optional dictionary with consensus settings: {'number_of_labels': 3, + 'coverage_percentage': 0.1} + + Returns: the created batch """ # @TODO: make this automatic? @@ -773,6 +779,156 @@ def create_batch( return self._create_batch_sync(name, dr_ids, global_keys, priority, consensus_settings) + def create_batches( + self, + name_prefix: str, + data_rows: Optional[List[Union[str, DataRow]]] = None, + global_keys: Optional[List[str]] = None, + priority: int = 5, + consensus_settings: Optional[Dict[str, float]] = None, + ) -> CreateBatchesTask: + """ + Creates batches for a project from a list of data rows. One of `global_keys` or `data_rows` must be provided, + but not both. When more than 100k data rows are specified and thus multiple batches are needed, the specific + batch that each data row will be placed in is undefined. + + Batches will be created with the specified name prefix and a unique suffix. The suffix will be a 4-digit + number starting at 0000. For example, if the name prefix is "batch" and 3 batches are created, the names + will be "batch0000", "batch0001", and "batch0002". This method will throw an error if a batch with the same + name already exists. + + Args: + name_prefix: a prefix for the batch names, must be unique within a project + data_rows: Either a list of `DataRows` or Data Row ids. + global_keys: global keys for data rows to add to the batch. + priority: An optional priority for the Data Rows in the Batch. 1 highest -> 5 lowest + consensus_settings: An optional dictionary with consensus settings: {'number_of_labels': 3, + 'coverage_percentage': 0.1} + + Returns: a task for the created batches + """ + + if self.queue_mode != QueueMode.Batch: + raise ValueError("Project must be in batch mode") + + dr_ids = [] + if data_rows is not None: + for dr in data_rows: + if isinstance(dr, Entity.DataRow): + dr_ids.append(dr.uid) + elif isinstance(dr, str): + dr_ids.append(dr) + else: + raise ValueError( + "`data_rows` must be DataRow ids or DataRow objects") + + self._wait_until_data_rows_are_processed( + dr_ids, global_keys, self._wait_processing_max_seconds) + + if consensus_settings: + consensus_settings = ConsensusSettings(**consensus_settings).dict( + by_alias=True) + + method = 'createBatches' + mutation_str = """mutation %sPyApi($projectId: ID!, $input: CreateBatchesInput!) 
{ + project(where: {id: $projectId}) { + %s(input: $input) { + tasks { + batchUuid + taskId + } + } + } + } + """ % (method, method) + + params = { + "projectId": self.uid, + "input": { + "batchNamePrefix": name_prefix, + "dataRowIds": dr_ids, + "globalKeys": global_keys, + "priority": priority, + "consensusSettings": consensus_settings + } + } + + tasks = self.client.execute( + mutation_str, params, experimental=True)["project"][method]["tasks"] + batch_ids = [task["batchUuid"] for task in tasks] + task_ids = [task["taskId"] for task in tasks] + + return CreateBatchesTask(self.client, self.uid, batch_ids, task_ids) + + def create_batches_from_dataset( + self, + name_prefix: str, + dataset_id: str, + priority: int = 5, + consensus_settings: Optional[Dict[str, + float]] = None) -> CreateBatchesTask: + """ + Creates batches for a project from a dataset, selecting only the data rows that are not already added to the + project. When the dataset contains more than 100k data rows and multiple batches are needed, the specific batch + that each data row will be placed in is undefined. Note that data rows may not be immediately available for a + project after being added to a dataset; use the `_wait_until_data_rows_are_processed` method to ensure that + data rows are available before creating batches. + + Batches will be created with the specified name prefix and a unique suffix. The suffix will be a 4-digit + number starting at 0000. For example, if the name prefix is "batch" and 3 batches are created, the names + will be "batch0000", "batch0001", and "batch0002". This method will throw an error if a batch with the same + name already exists. + + Args: + name_prefix: a prefix for the batch names, must be unique within a project + dataset_id: the id of the dataset to create batches from + priority: An optional priority for the Data Rows in the Batch. 1 highest -> 5 lowest + consensus_settings: An optional dictionary with consensus settings: {'number_of_labels': 3, + 'coverage_percentage': 0.1} + + Returns: a task for the created batches + """ + + if self.queue_mode != QueueMode.Batch: + raise ValueError("Project must be in batch mode") + + if consensus_settings: + consensus_settings = ConsensusSettings(**consensus_settings).dict( + by_alias=True) + + print("Creating batches from dataset %s", dataset_id) + + method = 'createBatchesFromDataset' + mutation_str = """mutation %sPyApi($projectId: ID!, $input: CreateBatchesFromDatasetInput!) { + project(where: {id: $projectId}) { + %s(input: $input) { + tasks { + batchUuid + taskId + } + } + } + } + """ % (method, method) + + params = { + "projectId": self.uid, + "input": { + "batchNamePrefix": name_prefix, + "datasetId": dataset_id, + "priority": priority, + "consensusSettings": consensus_settings + } + } + + tasks = self.client.execute( + mutation_str, params, experimental=True)["project"][method]["tasks"] + + batch_ids = [task["batchUuid"] for task in tasks] + task_ids = [task["taskId"] for task in tasks] + + return CreateBatchesTask(self.client, self.uid, batch_ids, task_ids) + def _create_batch_sync(self, name, dr_ids, global_keys, priority, consensus_settings): method = 'createBatchV2' @@ -843,7 +999,7 @@ def _create_batch_async(self, add_data_rows_mutation_str = """mutation %sPyApi($projectId: ID!, $input: AddDataRowsToBatchInput!) 
{ project(where: {id: $projectId}) { %s(input: $input) { - taskId + taskId } } } @@ -871,29 +1027,7 @@ def _create_batch_async(self, raise LabelboxError(f"Batch was not created successfully: " + json.dumps(task.errors)) - # obtain batch entity to return - get_batch_str = """query %s($projectId: ID!, $batchId: ID!) { - project(where: {id: $projectId}) { - batches(where: {id: $batchId}) { - nodes { - %s - } - } - } - } - """ % ("getProjectBatchPyApi", - query.results_query_part(Entity.Batch)) - - batch = self.client.execute( - get_batch_str, { - "projectId": self.uid, - "batchId": batch_id - }, - timeout=180.0, - experimental=True)["project"]["batches"]["nodes"][0] - - # TODO async endpoints currently do not provide failed_data_row_ids in response - return Entity.Batch(self.client, self.uid, batch) + return self.client.get_batch(self.uid, batch_id) def _update_queue_mode(self, mode: "QueueMode") -> "QueueMode": """ diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 75474ab4d..ed4229b4d 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -5,13 +5,13 @@ import uuid from enum import Enum from types import SimpleNamespace -from typing import Type +from typing import Type, List import pytest import requests -from labelbox import Client, MediaType -from labelbox import LabelingFrontend, Dataset +from labelbox import Client, Dataset +from labelbox import LabelingFrontend from labelbox import OntologyBuilder, Tool, Option, Classification, MediaType from labelbox.orm import query from labelbox.pagination import PaginatedCollection @@ -768,3 +768,42 @@ def is_adv_enabled(client) -> bool: query_str = "query IsAdvEnabledPyApi { user { isAdvEnabled } }" response = client.execute(query_str) return bool(response['user']['isAdvEnabled']) + + +IMAGE_URL = "https://storage.googleapis.com/diagnostics-demo-data/coco/COCO_train2014_000000000034.jpg" +EXTERNAL_ID = "my-image" + + +@pytest.fixture +def big_dataset(dataset: Dataset): + task = dataset.create_data_rows([ + { + "row_data": IMAGE_URL, + "external_id": EXTERNAL_ID + }, + ] * 3) + task.wait_till_done() + + yield dataset + + +@pytest.fixture +def big_dataset_data_row_ids(big_dataset: Dataset) -> List[str]: + yield [dr.uid for dr in list(big_dataset.export_data_rows())] + + +@pytest.fixture(scope='function') +def dataset_with_invalid_data_rows(unique_dataset: Dataset): + upload_invalid_data_rows_for_dataset(unique_dataset) + + yield unique_dataset + + +def upload_invalid_data_rows_for_dataset(dataset: Dataset): + task = dataset.create_data_rows([ + { + "row_data": 'gs://invalid-bucket/example.png', # forbidden + "external_id": "image-without-access.jpg" + }, + ] * 2) + task.wait_till_done() diff --git a/tests/integration/test_batch.py b/tests/integration/test_batch.py index 485bf308c..40eb632ef 100644 --- a/tests/integration/test_batch.py +++ b/tests/integration/test_batch.py @@ -1,53 +1,22 @@ import time +from typing import List +from uuid import uuid4 + import pytest -from uuid import uuid4 from labelbox import Dataset, Project from labelbox.exceptions import ProcessingWaitTimeout, MalformedQueryException, ResourceConflict, LabelboxError - -IMAGE_URL = "https://storage.googleapis.com/diagnostics-demo-data/coco/COCO_train2014_000000000034.jpg" -EXTERNAL_ID = "my-image" +from integration.conftest import upload_invalid_data_rows_for_dataset, IMAGE_URL, EXTERNAL_ID def get_data_row_ids(ds: Dataset): return [dr.uid for dr in list(ds.export_data_rows())] -@pytest.fixture -def big_dataset(dataset: 
Dataset): - task = dataset.create_data_rows([ - { - "row_data": IMAGE_URL, - "external_id": EXTERNAL_ID - }, - ] * 3) - task.wait_till_done() - - yield dataset - - -@pytest.fixture(scope='function') -def dataset_with_invalid_data_rows(unique_dataset: Dataset): - upload_invalid_data_rows_for_dataset(unique_dataset) - - yield unique_dataset - - -def upload_invalid_data_rows_for_dataset(dataset: Dataset): - task = dataset.create_data_rows([ - { - "row_data": 'gs://invalid-bucket/example.png', # forbidden - "external_id": "image-without-access.jpg" - }, - ] * 2) - task.wait_till_done() - - -def test_create_batch(project: Project, big_dataset: Dataset): - data_rows = [dr.uid for dr in list(big_dataset.export_data_rows())] - batch = project.create_batch("test-batch", data_rows, 3) +def test_create_batch(project: Project, big_dataset_data_row_ids: List[str]): + batch = project.create_batch("test-batch", big_dataset_data_row_ids, 3) assert batch.name == "test-batch" - assert batch.size == len(data_rows) + assert batch.size == len(big_dataset_data_row_ids) def test_create_batch_with_invalid_data_rows_ids(project: Project): @@ -125,11 +94,13 @@ def test_create_batch_with_float_number_priority(project: Project, priority=4.9) -def test_create_batch_async(project: Project, big_dataset: Dataset): - data_rows = [dr.uid for dr in list(big_dataset.export_data_rows())] - batch = project._create_batch_async("big-batch", data_rows, priority=3) +def test_create_batch_async(project: Project, + big_dataset_data_row_ids: List[str]): + batch = project._create_batch_async("big-batch", + big_dataset_data_row_ids, + priority=3) assert batch.name == "big-batch" - assert batch.size == len(data_rows) + assert batch.size == len(big_dataset_data_row_ids) def test_create_batch_with_consensus_settings(project: Project, diff --git a/tests/integration/test_batches.py b/tests/integration/test_batches.py new file mode 100644 index 000000000..12a4a4355 --- /dev/null +++ b/tests/integration/test_batches.py @@ -0,0 +1,36 @@ +from typing import List + +import pytest + +from labelbox import Project, Dataset + + +def test_create_batches(project: Project, big_dataset_data_row_ids: List[str]): + task = project.create_batches("test-batch", + big_dataset_data_row_ids, + priority=3) + + task.wait_till_done() + assert task.errors() is None + batches = task.result() + + assert len(batches) == 1 + assert batches[0].name == "test-batch0000" + assert batches[0].size == len(big_dataset_data_row_ids) + + +def test_create_batches_from_dataset(project: Project, big_dataset: Dataset): + data_rows = [dr.uid for dr in list(big_dataset.export_data_rows())] + project._wait_until_data_rows_are_processed(data_rows, [], 300) + + task = project.create_batches_from_dataset("test-batch", + big_dataset.uid, + priority=3) + + task.wait_till_done() + assert task.errors() is None + batches = task.result() + + assert len(batches) == 1 + assert batches[0].name == "test-batch0000" + assert batches[0].size == len(data_rows) From 20ebe5f4165aacb4166febd73e82c3837ef33d12 Mon Sep 17 00:00:00 2001 From: Attila Papai <97034214+attila-papai@users.noreply.github.com> Date: Thu, 17 Aug 2023 21:21:48 +0200 Subject: [PATCH 03/20] [AL-6729] assert adv task errors separately (#1214) --- tests/integration/test_task.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/tests/integration/test_task.py b/tests/integration/test_task.py index d591b16a9..b035b09ed 100644 --- a/tests/integration/test_task.py +++ 
b/tests/integration/test_task.py @@ -8,7 +8,7 @@ TEXT_SCHEMA_ID = "cko8s9r5v0001h2dk9elqdidh" -def test_task_errors(dataset, image_url, snapshot): +def test_task_errors(dataset, image_url, snapshot, is_adv_enabled): client = dataset.client task = dataset.create_data_rows([ { @@ -25,16 +25,22 @@ def test_task_errors(dataset, image_url, snapshot): assert task in client.get_user().created_tasks() task.wait_till_done() - # assert task.status == "FAILED" - # assert len(task.failed_data_rows) > 0 - snapshot.snapshot_dir = INTEGRATION_SNAPSHOT_DIRECTORY - # RowData is dynamic, so we need to remove it from the snapshot - task.failed_data_rows[0]['failedDataRows'][0]['rowData'] = '' - snapshot.assert_match(json.dumps(task.failed_data_rows), - 'test_task.test_task_errors.failed_data_rows.json') - assert task.errors is not None - snapshot.assert_match(json.dumps(task.errors), - 'test_task.test_task_errors.errors.json') + if is_adv_enabled: + assert len(task.failed_data_rows) == 1 + assert "A schemaId can only be specified once per DataRow : [cko8s9r5v0001h2dk9elqdidh]" in task.failed_data_rows[ + 0]['message'] + assert len( + task.failed_data_rows[0]['failedDataRows'][0]['metadata']) == 2 + else: + snapshot.snapshot_dir = INTEGRATION_SNAPSHOT_DIRECTORY + # RowData is dynamic, so we need to remove it from the snapshot + task.failed_data_rows[0]['failedDataRows'][0]['rowData'] = '' + snapshot.assert_match( + json.dumps(task.failed_data_rows), + 'test_task.test_task_errors.failed_data_rows.json') + assert task.errors is not None + snapshot.assert_match(json.dumps(task.errors), + 'test_task.test_task_errors.errors.json') def test_task_success_json(dataset, image_url, snapshot): From 20710d6fb4ce2b2f1d1fc4e55af3879ab6316e5c Mon Sep 17 00:00:00 2001 From: Attila Papai <97034214+attila-papai@users.noreply.github.com> Date: Fri, 18 Aug 2023 09:41:25 +0200 Subject: [PATCH 04/20] [AL-6740] assert adv global key errors separately (#1215) --- tests/integration/test_global_keys.py | 37 +++++++++++++++++---------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/tests/integration/test_global_keys.py b/tests/integration/test_global_keys.py index 25ca7ba33..6b9588c18 100644 --- a/tests/integration/test_global_keys.py +++ b/tests/integration/test_global_keys.py @@ -116,7 +116,8 @@ def test_long_global_key_validation(client, dataset, image_url): 'error'] == 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid' -def test_global_key_with_whitespaces_validation(client, dataset, image_url): +def test_global_key_with_whitespaces_validation(client, dataset, image_url, + is_adv_enabled): dr_1 = dataset.create_data_row(row_data=image_url) dr_2 = dataset.create_data_row(row_data=image_url) dr_3 = dataset.create_data_row(row_data=image_url) @@ -137,19 +138,27 @@ def test_global_key_with_whitespaces_validation(client, dataset, image_url): }] res = client.assign_global_keys_to_data_rows(assignment_inputs) - assert len(res['results']) == 0 - assert len(res['errors']) == 3 - assert res['status'] == 'FAILURE' - assign_errors_ids = set([e['data_row_id'] for e in res['errors']]) - assign_errors_gks = set([e['global_key'] for e in res['errors']]) - assign_errors_msgs = set([e['error'] for e in res['errors']]) - assert assign_errors_ids == set([dr_1.uid, dr_2.uid, dr_3.uid]) - assert assign_errors_gks == set([gk_1, gk_2, gk_3]) - assert assign_errors_msgs == set([ - 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', - 'Invalid assignment. 
Either DataRow does not exist, or globalKey is invalid', - 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid' - ]) + if is_adv_enabled: + assert res['status'] == 'PARTIAL SUCCESS' + assert len(res['results']) == 2 + assert len(res['errors']) == 1 + assert res['errors'][0]['global_key'] == gk_3 + assert res['errors'][0][ + 'error'] == "Invalid assignment. Either DataRow does not exist, or globalKey is invalid" + else: + assert len(res['results']) == 0 + assert len(res['errors']) == 3 + assert res['status'] == 'FAILURE' + assign_errors_ids = set([e['data_row_id'] for e in res['errors']]) + assign_errors_gks = set([e['global_key'] for e in res['errors']]) + assign_errors_msgs = set([e['error'] for e in res['errors']]) + assert assign_errors_ids == set([dr_1.uid, dr_2.uid, dr_3.uid]) + assert assign_errors_gks == set([gk_1, gk_2, gk_3]) + assert assign_errors_msgs == set([ + 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', + 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', + 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid' + ]) def test_get_data_row_ids_for_global_keys(client, dataset, image_url): From b439780d1a8a164e3b1c143c3cd86e910bfd95dd Mon Sep 17 00:00:00 2001 From: Attila Papai <97034214+attila-papai@users.noreply.github.com> Date: Fri, 18 Aug 2023 17:37:57 +0200 Subject: [PATCH 05/20] Remove sleep between creating and exporting a batch (#1213) --- tests/integration/export_v2/test_legacy_export.py | 1 - tests/integration/test_batch.py | 2 -- tests/integration/test_project.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/tests/integration/export_v2/test_legacy_export.py b/tests/integration/export_v2/test_legacy_export.py index 071e8254d..e4e1e595c 100644 --- a/tests/integration/export_v2/test_legacy_export.py +++ b/tests/integration/export_v2/test_legacy_export.py @@ -171,7 +171,6 @@ def test_export_data_rows(project: Project, dataset: Dataset): data_rows = [dr.uid for dr in list(dataset.export_data_rows())] batch = project.create_batch("batch test", data_rows) - result = list(batch.export_data_rows()) exported_data_rows = [dr.uid for dr in result] diff --git a/tests/integration/test_batch.py b/tests/integration/test_batch.py index 40eb632ef..600762817 100644 --- a/tests/integration/test_batch.py +++ b/tests/integration/test_batch.py @@ -211,8 +211,6 @@ def test_export_data_rows(project: Project, dataset: Dataset): data_rows = [dr.uid for dr in list(dataset.export_data_rows())] batch = project.create_batch("batch test", data_rows) - # allow time for catapult to sync changes to ES - time.sleep(5) result = list(batch.export_data_rows()) exported_data_rows = [dr.uid for dr in result] diff --git a/tests/integration/test_project.py b/tests/integration/test_project.py index 20b36533a..b9467e0dd 100644 --- a/tests/integration/test_project.py +++ b/tests/integration/test_project.py @@ -228,8 +228,6 @@ def test_create_batch_with_global_keys_sync(project: Project, data_rows): global_keys = [dr.global_key for dr in data_rows] batch_name = f'batch {uuid.uuid4()}' batch = project.create_batch(batch_name, global_keys=global_keys) - # allow time for catapult to sync changes to ES - time.sleep(5) # TODO: Move to export_v2 batch_data_rows = set(batch.export_data_rows()) assert batch_data_rows == set(data_rows) From be939096cad7ed1e45ce1c0cb54823245bfdb28b Mon Sep 17 00:00:00 2001 From: Attila Papai <97034214+attila-papai@users.noreply.github.com> Date: Fri, 18 Aug 2023 17:38:57 +0200 
Subject: [PATCH 06/20] Improve ADV assertion for bulk sync import (#1212) --- tests/integration/test_data_rows.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/tests/integration/test_data_rows.py b/tests/integration/test_data_rows.py index 8248304b3..6a89bf9d4 100644 --- a/tests/integration/test_data_rows.py +++ b/tests/integration/test_data_rows.py @@ -929,9 +929,7 @@ def test_data_row_bulk_creation_sync_with_same_global_keys( dataset, sample_image, is_adv_enabled): global_key_1 = str(uuid.uuid4()) - if is_adv_enabled: - # ADV does not throw an error for duplicate global keys - # but rather create the first one and reject the second + with pytest.raises(labelbox.exceptions.MalformedQueryException) as exc_info: dataset.create_data_rows_sync([{ DataRow.row_data: sample_image, DataRow.global_key: global_key_1 @@ -939,18 +937,13 @@ def test_data_row_bulk_creation_sync_with_same_global_keys( DataRow.row_data: sample_image, DataRow.global_key: global_key_1 }]) + + if is_adv_enabled: + # ADV will import the first data row but not the second (duplicate global key) assert len(list(dataset.data_rows())) == 1 - assert list(dataset.data_rows())[0].global_key == global_key_1 + assert "Some data rows were not imported. Check error output here" in str( + exc_info.value) else: - with pytest.raises(labelbox.exceptions.MalformedQueryException): - dataset.create_data_rows_sync([{ - DataRow.row_data: sample_image, - DataRow.global_key: global_key_1 - }, { - DataRow.row_data: sample_image, - DataRow.global_key: global_key_1 - }]) - assert len(list(dataset.data_rows())) == 0 dataset.create_data_rows_sync([{ From 38c5776f65670a0cfac9a11667a87ab6b2ae1039 Mon Sep 17 00:00:00 2001 From: Attila Papai <97034214+attila-papai@users.noreply.github.com> Date: Mon, 21 Aug 2023 15:33:13 +0200 Subject: [PATCH 07/20] [AL-6929] adjust ADV specific asserts (#1216) --- tests/integration/test_data_rows.py | 1 + tests/integration/test_global_keys.py | 37 ++++++++++----------------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/tests/integration/test_data_rows.py b/tests/integration/test_data_rows.py index 6a89bf9d4..fdc4e7eb4 100644 --- a/tests/integration/test_data_rows.py +++ b/tests/integration/test_data_rows.py @@ -941,6 +941,7 @@ def test_data_row_bulk_creation_sync_with_same_global_keys( if is_adv_enabled: # ADV will import the first data row but not the second (duplicate global key) assert len(list(dataset.data_rows())) == 1 + assert list(dataset.data_rows())[0].global_key == global_key_1 assert "Some data rows were not imported. Check error output here" in str( exc_info.value) else: diff --git a/tests/integration/test_global_keys.py b/tests/integration/test_global_keys.py index 6b9588c18..25ca7ba33 100644 --- a/tests/integration/test_global_keys.py +++ b/tests/integration/test_global_keys.py @@ -116,8 +116,7 @@ def test_long_global_key_validation(client, dataset, image_url): 'error'] == 'Invalid assignment. 
Either DataRow does not exist, or globalKey is invalid' -def test_global_key_with_whitespaces_validation(client, dataset, image_url, - is_adv_enabled): +def test_global_key_with_whitespaces_validation(client, dataset, image_url): dr_1 = dataset.create_data_row(row_data=image_url) dr_2 = dataset.create_data_row(row_data=image_url) dr_3 = dataset.create_data_row(row_data=image_url) @@ -138,27 +137,19 @@ def test_global_key_with_whitespaces_validation(client, dataset, image_url, }] res = client.assign_global_keys_to_data_rows(assignment_inputs) - if is_adv_enabled: - assert res['status'] == 'PARTIAL SUCCESS' - assert len(res['results']) == 2 - assert len(res['errors']) == 1 - assert res['errors'][0]['global_key'] == gk_3 - assert res['errors'][0][ - 'error'] == "Invalid assignment. Either DataRow does not exist, or globalKey is invalid" - else: - assert len(res['results']) == 0 - assert len(res['errors']) == 3 - assert res['status'] == 'FAILURE' - assign_errors_ids = set([e['data_row_id'] for e in res['errors']]) - assign_errors_gks = set([e['global_key'] for e in res['errors']]) - assign_errors_msgs = set([e['error'] for e in res['errors']]) - assert assign_errors_ids == set([dr_1.uid, dr_2.uid, dr_3.uid]) - assert assign_errors_gks == set([gk_1, gk_2, gk_3]) - assert assign_errors_msgs == set([ - 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', - 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', - 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid' - ]) + assert len(res['results']) == 0 + assert len(res['errors']) == 3 + assert res['status'] == 'FAILURE' + assign_errors_ids = set([e['data_row_id'] for e in res['errors']]) + assign_errors_gks = set([e['global_key'] for e in res['errors']]) + assign_errors_msgs = set([e['error'] for e in res['errors']]) + assert assign_errors_ids == set([dr_1.uid, dr_2.uid, dr_3.uid]) + assert assign_errors_gks == set([gk_1, gk_2, gk_3]) + assert assign_errors_msgs == set([ + 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', + 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', + 'Invalid assignment. 
Either DataRow does not exist, or globalKey is invalid' + ]) def test_get_data_row_ids_for_global_keys(client, dataset, image_url): From 91e1127f42362077d9df8fe15b73f3a43d0be181 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Mon, 21 Aug 2023 14:26:48 -0700 Subject: [PATCH 08/20] Try and fix test_filtering flaky test by removing dataset query testing part, since it is not applicable to BATCH projects any more --- tests/integration/test_filtering.py | 39 +++-------------------------- 1 file changed, 4 insertions(+), 35 deletions(-) diff --git a/tests/integration/test_filtering.py b/tests/integration/test_filtering.py index 5cd185258..fde7f0638 100644 --- a/tests/integration/test_filtering.py +++ b/tests/integration/test_filtering.py @@ -24,18 +24,15 @@ def project_to_test_where(client, rand_gen): # Avoid assertions using equality to prevent intermittent failures due to # other builds simultaneously adding projects to test org -def test_where(client, image_url, project_to_test_where, rand_gen): +def test_where(client, project_to_test_where): p_a, p_b, p_c = project_to_test_where - p_a_name, p_b_name, p_c_name = [p.name for p in [p_a, p_b, p_c]] + p_a_name, p_b_name, _ = [p.name for p in [p_a, p_b, p_c]] - def _get(f, where=None): + def get(where=None): date_where = Project.created_at >= p_a.created_at where = date_where if where is None else where & date_where return {p.uid for p in client.get_projects(where)} - def get(where=None): - return _get(client.get_projects, where) - assert {p_a.uid, p_b.uid, p_c.uid}.issubset(get()) e_a = get(Project.name == p_a_name) assert p_a.uid in e_a and p_b.uid not in e_a and p_c.uid not in e_a @@ -50,34 +47,6 @@ def get(where=None): le_b = get(Project.name <= p_b_name) assert {p_a.uid, p_b.uid}.issubset(le_b) and p_c.uid not in le_b - dataset = client.create_dataset(name="Dataset") - data_row = dataset.create_data_row(row_data=image_url) - data_row_ids = [data_row.uid] - batch = p_a.create_batch( - rand_gen(str), - data_row_ids, # sample of data row objects - 5 # priority between 1(Highest) - 5(lowest) - ) - - def get(where=None): - return _get(batch.project, where) - - assert {p_a.uid, p_b.uid, p_c.uid}.issubset(get()) - e_a = get(Project.name == p_a_name) - assert p_a.uid in e_a and p_b.uid not in e_a and p_c.uid not in e_a - not_b = get(Project.name != p_b_name) - assert {p_a.uid, p_c.uid}.issubset(not_b) and p_b.uid not in not_b - gt_b = get(Project.name > p_b_name) - assert p_c.uid in gt_b and p_a.uid not in gt_b and p_b.uid not in gt_b - lt_b = get(Project.name < p_b_name) - assert p_a.uid in lt_b and p_b.uid not in lt_b and p_c.uid not in lt_b - ge_b = get(Project.name >= p_b_name) - assert {p_b.uid, p_c.uid}.issubset(ge_b) and p_a.uid not in ge_b - le_b = get(Project.name <= p_b_name) - assert {p_a.uid, p_b.uid}.issubset(le_b) and p_c.uid not in le_b - - batch.delete() - def test_unsupported_where(client): with pytest.raises(InvalidQueryError): @@ -89,4 +58,4 @@ def test_unsupported_where(client): (Project.description == "b")) with pytest.raises(InvalidQueryError): - client.get_projects(where=~(Project.name == "a")) + client.get_projects(where=~(Project.name == "a")) \ No newline at end of file From 48285e46b2ae509a8c1ccdc4141459ddd5ad006a Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Wed, 2 Aug 2023 16:20:23 -0700 Subject: [PATCH 09/20] Add instrumentation for fixtures(temp) --- .../integration/annotation_import/conftest.py | 13 ++++++- tests/integration/conftest.py | 36 +++++++++++++++++++ tests/integration/test_dataset.py | 8 +++-- 3 files 
changed, 54 insertions(+), 3 deletions(-) diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index 6db398fe5..6e35d4d0a 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -9,6 +9,7 @@ from typing import Type from labelbox.schema.labeling_frontend import LabelingFrontend from labelbox.schema.annotation_import import LabelImport, AnnotationImportState +from labelbox.schema.project import Project from labelbox.schema.queue_mode import QueueMode DATA_ROW_PROCESSING_WAIT_TIMEOUT_SECONDS = 40 @@ -486,6 +487,7 @@ def initial_dataset(client, rand_gen): @pytest.fixture def configured_project(client, initial_dataset, ontology, rand_gen, image_url): + start_time = time.time() dataset = initial_dataset project = client.create_project( name=rand_gen(str), @@ -496,14 +498,21 @@ def configured_project(client, initial_dataset, ontology, rand_gen, image_url): where=LabelingFrontend.name == "editor"))[0] project.setup(editor, ontology) data_row_ids = [] - + # print("Before creating data rows ", time.time() - start_time) + num_rows = 0 for _ in range(len(ontology['tools']) + len(ontology['classifications'])): data_row_ids.append(dataset.create_data_row(row_data=image_url).uid) + num_rows += 1 + # print("After creating data rows ", time.time() - start_time) + + pytest.data_row_report['times'] += time.time() - start_time + pytest.data_row_report['num_rows'] += num_rows project.create_batch( rand_gen(str), data_row_ids, # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) ) + print("After creating batch ", time.time() - start_time) project.data_row_ids = data_row_ids yield project project.delete() @@ -1006,6 +1015,7 @@ def model_run_with_training_metadata(rand_gen, model): @pytest.fixture def model_run_with_data_rows(client, configured_project, model_run_predictions, model_run, wait_for_label_processing): + start_time = time.time() configured_project.enable_model_assisted_labeling() upload_task = LabelImport.create_from_objects( @@ -1019,6 +1029,7 @@ def model_run_with_data_rows(client, configured_project, model_run_predictions, labels = wait_for_label_processing(configured_project) label_ids = [label.uid for label in labels] model_run.upsert_labels(label_ids) + print(f"model_run_with_data_rows: {time.time() - start_time}") yield model_run model_run.delete() # TODO: Delete resources when that is possible .. 
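Editorial note (not part of the patch): the tests/integration/conftest.py hunk below adds per-fixture timing via a pytest hookwrapper. A minimal, self-contained sketch of that pattern is shown here for reference; the name fixture_times is illustrative only and does not appear in the patch.

import time
from collections import defaultdict

import pytest

fixture_times = defaultdict(float)

@pytest.hookimpl(hookwrapper=True)
def pytest_fixture_setup(fixturedef, request):
    # Wrap every fixture setup and accumulate its wall-clock duration.
    start = time.time()
    yield
    fixture_times[fixturedef.argname] += time.time() - start

@pytest.fixture(scope="session", autouse=True)
def print_fixture_times():
    # Report the slowest fixtures once the test session finishes.
    yield
    slowest = sorted(fixture_times.items(), key=lambda kv: kv[1], reverse=True)[:10]
    for name, total in slowest:
        print(f"{name}: {total:.2f}s")

Placed in a conftest.py, the hook applies to every fixture in the suite without modifying the fixtures themselves, which is the approach the following diff takes.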
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index ed4229b4d..92e23a375 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,3 +1,5 @@ +from collections import defaultdict +from itertools import islice import json import os import re @@ -807,3 +809,37 @@ def upload_invalid_data_rows_for_dataset(dataset: Dataset): }, ] * 2) task.wait_till_done() + + +def pytest_configure(): + pytest.report = defaultdict(int) + pytest.data_row_report = {'times': 0, 'num_rows': 0} + + +@pytest.hookimpl(hookwrapper=True) +def pytest_fixture_setup(fixturedef, request): + start = time.time() + yield + + end = time.time() + + exec_time = end - start + pytest.report[fixturedef.argname] += exec_time + + # print('pytest_fixture_setup' + # f', request={request}' + # f', create_data_row_time={end - start}') + + +@pytest.fixture(scope='session', autouse=True) +def print_perf_summary(): + yield + + sorted_dict = dict( + sorted(pytest.report.items(), key=lambda item: item[1], reverse=True)) + num_of_entries = 10 if len(sorted_dict) >= 10 else len(sorted_dict) + slowest_fixtures = [ + (aaa, sorted_dict[aaa]) for aaa in islice(sorted_dict, num_of_entries) + ] + print("\nTop slowest fixtures:\n", slowest_fixtures) + print("Data row report:\n", pytest.data_row_report) diff --git a/tests/integration/test_dataset.py b/tests/integration/test_dataset.py index d1a31e532..de2f15820 100644 --- a/tests/integration/test_dataset.py +++ b/tests/integration/test_dataset.py @@ -53,8 +53,12 @@ def dataset_for_filtering(client, rand_gen): yield name_1, d1, name_2, d2 - d1.delete() - d2.delete() + +def test_dataset_filtering(client, dataset_for_filtering): + name_1, d1, name_2, d2 = dataset_for_filtering + + assert list(client.get_datasets(where=Dataset.name == name_1)) == [d1] + assert list(client.get_datasets(where=Dataset.name == name_2)) == [d2] def test_dataset_filtering(client, dataset_for_filtering): From b95d1b89767c8ea6f085596315d056c8982b9f99 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Mon, 14 Aug 2023 09:16:26 -0700 Subject: [PATCH 10/20] Convert tests that do now require many data rows prebuilt to a simpler project fixture --- pytest.ini | 2 +- .../integration/annotation_import/conftest.py | 13 ++--- .../test_bulk_import_request.py | 49 +++++++++---------- .../annotation_import/test_data_types.py | 22 ++++----- tests/integration/conftest.py | 29 +++++++---- tests/integration/test_project.py | 14 ++---- 6 files changed, 64 insertions(+), 65 deletions(-) diff --git a/pytest.ini b/pytest.ini index b56afefdd..fbf64a864 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] -addopts = -s -vv --reruns 5 --reruns-delay 10 --durations=20 +addopts = -s -vv markers = slow: marks tests as slow (deselect with '-m "not slow"') diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index 6e35d4d0a..1f88de47a 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -486,17 +486,12 @@ def initial_dataset(client, rand_gen): @pytest.fixture -def configured_project(client, initial_dataset, ontology, rand_gen, image_url): +def configured_project(client, configured_project_without_data_rows, + initial_dataset, ontology, rand_gen, image_url): start_time = time.time() dataset = initial_dataset - project = client.create_project( - name=rand_gen(str), - queue_mode=QueueMode.Batch, - ) - editor = list( - client.get_labeling_frontends( - 
where=LabelingFrontend.name == "editor"))[0] - project.setup(editor, ontology) + project = configured_project_without_data_rows + data_row_ids = [] # print("Before creating data rows ", time.time() - start_time) num_rows = 0 diff --git a/tests/integration/annotation_import/test_bulk_import_request.py b/tests/integration/annotation_import/test_bulk_import_request.py index 4f001af8d..7a66dd667 100644 --- a/tests/integration/annotation_import/test_bulk_import_request.py +++ b/tests/integration/annotation_import/test_bulk_import_request.py @@ -25,15 +25,15 @@ """ -def test_create_from_url(configured_project): +def test_create_from_url(project): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" - bulk_import_request = configured_project.upload_annotations(name=name, - annotations=url, - validate=False) + bulk_import_request = project.upload_annotations(name=name, + annotations=url, + validate=False) - assert bulk_import_request.project() == configured_project + assert bulk_import_request.project() == project assert bulk_import_request.name == name assert bulk_import_request.input_file_url == url assert bulk_import_request.error_file_url is None @@ -41,24 +41,24 @@ def test_create_from_url(configured_project): assert bulk_import_request.state == BulkImportRequestState.RUNNING -def test_validate_file(configured_project): +def test_validate_file(project_with_ontology): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" with pytest.raises(MALValidationError): - configured_project.upload_annotations(name=name, - annotations=url, - validate=True) + project_with_ontology.upload_annotations(name=name, + annotations=url, + validate=True) #Schema ids shouldn't match -def test_create_from_objects(configured_project, predictions, +def test_create_from_objects(configured_project_without_data_rows, predictions, annotation_import_test_helpers): name = str(uuid.uuid4()) - bulk_import_request = configured_project.upload_annotations( + bulk_import_request = configured_project_without_data_rows.upload_annotations( name=name, annotations=predictions) - assert bulk_import_request.project() == configured_project + assert bulk_import_request.project() == configured_project_without_data_rows assert bulk_import_request.name == name assert bulk_import_request.error_file_url is None assert bulk_import_request.status_file_url is None @@ -105,17 +105,17 @@ def test_create_from_local_file(tmp_path, predictions, configured_project, bulk_import_request.input_file_url, predictions) -def test_get(client, configured_project): +def test_get(client, configured_project_without_data_rows): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" - configured_project.upload_annotations(name=name, - annotations=url, - validate=False) + configured_project_without_data_rows.upload_annotations(name=name, + annotations=url, + validate=False) bulk_import_request = BulkImportRequest.from_name( - client, project_id=configured_project.uid, name=name) + client, project_id=configured_project_without_data_rows.uid, name=name) - assert bulk_import_request.project() == configured_project + assert bulk_import_request.project() == configured_project_without_data_rows assert bulk_import_request.name == name assert bulk_import_request.input_file_url == url assert bulk_import_request.error_file_url is None @@ -158,14 +158,13 @@ def test_validate_ndjson_uuid(tmp_path, 
configured_project, predictions): @pytest.mark.slow -def test_wait_till_done(rectangle_inference, configured_project): +def test_wait_till_done(rectangle_inference, + configured_project_without_data_rows): name = str(uuid.uuid4()) - url = configured_project.client.upload_data(content=parser.dumps( - [rectangle_inference]), - sign=True) - bulk_import_request = configured_project.upload_annotations(name=name, - annotations=url, - validate=False) + url = configured_project_without_data_rows.client.upload_data( + content=parser.dumps([rectangle_inference]), sign=True) + bulk_import_request = configured_project_without_data_rows.upload_annotations( + name=name, annotations=url, validate=False) assert len(bulk_import_request.inputs) == 1 bulk_import_request.wait_until_done() diff --git a/tests/integration/annotation_import/test_data_types.py b/tests/integration/annotation_import/test_data_types.py index f8e392cf5..30559198b 100644 --- a/tests/integration/annotation_import/test_data_types.py +++ b/tests/integration/annotation_import/test_data_types.py @@ -125,7 +125,6 @@ def create_data_row_for_project(project, dataset, data_row_ndjson, batch_name): [data_row.uid], # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) ) - project.data_row_ids.append(data_row.uid) return data_row @@ -135,12 +134,12 @@ def create_data_row_for_project(project, dataset, data_row_ndjson, batch_name): AudioData, ConversationData, DicomData, DocumentData, HTMLData, ImageData, TextData ]) -def test_import_data_types(client, configured_project, initial_dataset, - rand_gen, data_row_json_by_data_type, - annotations_by_data_type, data_type_class): +def test_import_data_types(client, project, initial_dataset, rand_gen, + data_row_json_by_data_type, annotations_by_data_type, + data_type_class): - project = configured_project - project_id = configured_project.uid + project = project + project_id = project.uid dataset = initial_dataset set_project_media_type_from_data_type(project, data_type_class) @@ -261,11 +260,11 @@ def test_import_data_types_v2(client, configured_project, initial_dataset, @pytest.mark.parametrize('data_type, data_class, annotations', test_params) -def test_import_label_annotations(client, configured_project, initial_dataset, - data_row_json_by_data_type, data_type, - data_class, annotations, rand_gen): +def test_import_label_annotations(client, configured_project_without_data_rows, + initial_dataset, data_row_json_by_data_type, + data_type, data_class, annotations, rand_gen): - project = configured_project + project = configured_project_without_data_rows dataset = initial_dataset set_project_media_type_from_data_type(project, data_class) @@ -297,7 +296,8 @@ def test_import_label_annotations(client, configured_project, initial_dataset, assert export_task.errors is None expected_annotations = get_annotation_comparison_dicts_from_labels(labels) actual_annotations = get_annotation_comparison_dicts_from_export( - export_task.result, data_row.uid, configured_project.uid) + export_task.result, data_row.uid, + configured_project_without_data_rows.uid) assert actual_annotations == expected_annotations data_row.delete() diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 92e23a375..c47524ed6 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -3,6 +3,7 @@ import json import os import re +import sys import time import uuid from enum import Enum @@ -390,9 +391,21 @@ def initial_dataset(client, rand_gen): @pytest.fixture -def 
configured_project(project, initial_dataset, client, rand_gen, image_url): +def project_with_ontology(project): + editor = list( + project.client.get_labeling_frontends( + where=LabelingFrontend.name == "editor"))[0] + empty_ontology = {"tools": [], "classifications": []} + project.setup(editor, empty_ontology) + yield project + + +@pytest.fixture +def configured_project(project_with_ontology, initial_dataset, rand_gen, + image_url): dataset = initial_dataset data_row_id = dataset.create_data_row(row_data=image_url).uid + project = project_with_ontology project.create_batch( rand_gen(str), @@ -401,14 +414,7 @@ def configured_project(project, initial_dataset, client, rand_gen, image_url): ) project.data_row_ids = [data_row_id] - editor = list( - project.client.get_labeling_frontends( - where=LabelingFrontend.name == "editor"))[0] - empty_ontology = {"tools": [], "classifications": []} - project.setup(editor, empty_ontology) yield project - dataset.delete() - project.delete() @pytest.fixture @@ -833,6 +839,8 @@ def pytest_fixture_setup(fixturedef, request): @pytest.fixture(scope='session', autouse=True) def print_perf_summary(): + print("Starting measurements\n", file=sys.stderr) + yield sorted_dict = dict( @@ -841,5 +849,6 @@ def print_perf_summary(): slowest_fixtures = [ (aaa, sorted_dict[aaa]) for aaa in islice(sorted_dict, num_of_entries) ] - print("\nTop slowest fixtures:\n", slowest_fixtures) - print("Data row report:\n", pytest.data_row_report) + print("\nTop slowest fixtures:\n", slowest_fixtures, file=sys.stderr) + print("Data row report:\n", pytest.data_row_report, file=sys.stderr) + # assert False diff --git a/tests/integration/test_project.py b/tests/integration/test_project.py index b9467e0dd..b3b683a3d 100644 --- a/tests/integration/test_project.py +++ b/tests/integration/test_project.py @@ -171,15 +171,15 @@ def test_attach_instructions(client, project): @pytest.mark.skipif(condition=os.environ['LABELBOX_TEST_ENVIRON'] == "onprem", reason="new mutation does not work for onprem") -def test_html_instructions(configured_project): +def test_html_instructions(project_with_ontology): html_file_path = '/tmp/instructions.html' sample_html_str = "" with open(html_file_path, 'w') as file: file.write(sample_html_str) - configured_project.upsert_instructions(html_file_path) - updated_ontology = configured_project.ontology().normalized + project_with_ontology.upsert_instructions(html_file_path) + updated_ontology = project_with_ontology.ontology().normalized instructions = updated_ontology.pop('projectInstructions') assert requests.get(instructions).text == sample_html_str @@ -200,10 +200,6 @@ def test_same_ontology_after_instructions( assert instructions is not None -def test_queue_mode(configured_project: Project): - assert configured_project.queue_mode == QueueMode.Batch - - def test_batches(project: Project, dataset: Dataset, image_url): task = dataset.create_data_rows([ { @@ -243,9 +239,9 @@ def test_create_batch_with_global_keys_async(project: Project, data_rows): assert batch_data_rows == set(data_rows) -def test_media_type(client, configured_project: Project, rand_gen): +def test_media_type(client, project: Project, rand_gen): # Existing project with no media_type - assert isinstance(configured_project.media_type, MediaType) + assert isinstance(project.media_type, MediaType) # Update test project = client.create_project(name=rand_gen(str)) From 4976908f25ece76697cda14efe952d061f69a92c Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Tue, 15 Aug 2023 15:05:36 -0700 Subject: [PATCH 
11/20] Adding an option to configure source of data rows for predictions, also making ndjson test use project without datatows --- .../integration/annotation_import/conftest.py | 81 +++++++++- .../test_ndjson_validation.py | 138 +++++++++++------- tests/integration/conftest.py | 2 - 3 files changed, 158 insertions(+), 63 deletions(-) diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index 1f88de47a..ca34d2dfb 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -486,8 +486,27 @@ def initial_dataset(client, rand_gen): @pytest.fixture -def configured_project(client, configured_project_without_data_rows, - initial_dataset, ontology, rand_gen, image_url): +def hardcoded_datarow_id(): + data_row_id = 'ck8q9q9qj00003g5z3q1q9q9q' + + def get_data_row_id(indx=0): + return data_row_id + + yield get_data_row_id + + +@pytest.fixture +def configured_project_datarow_id(configured_project): + + def get_data_row_id(indx=0): + return configured_project.data_row_ids[indx] + + yield get_data_row_id + + +@pytest.fixture +def configured_project(configured_project_without_data_rows, initial_dataset, + ontology, rand_gen, image_url): start_time = time.time() dataset = initial_dataset project = configured_project_without_data_rows @@ -509,6 +528,7 @@ def configured_project(client, configured_project_without_data_rows, ) print("After creating batch ", time.time() - start_time) project.data_row_ids = data_row_ids + yield project project.delete() @@ -577,10 +597,19 @@ def configured_project_without_data_rows(client, ontology, rand_gen): # In an example of a 'rectangle' we have extended to support multiple instances of the same tool type # TODO: we will support this approach in the future for all tools @pytest.fixture -def prediction_id_mapping(configured_project): +def prediction_id_mapping(configured_project_without_data_rows, ontology, + request): # Maps tool types to feature schema ids - project = configured_project + if 'configured_project' in request.fixturenames: + data_row_id_factory = request.getfixturevalue( + 'configured_project_datarow_id') + project = configured_project + else: + data_row_id_factory = request.getfixturevalue('hardcoded_datarow_id') + project = configured_project_without_data_rows + ontology = project.ontology().normalized + result = {} for idx, tool in enumerate(ontology['tools'] + ontology['classifications']): @@ -597,7 +626,47 @@ def prediction_id_mapping(configured_project): "schemaId": tool['featureSchemaId'], "name": tool['name'], "dataRow": { - "id": project.data_row_ids[idx], + "id": data_row_id_factory(idx), + }, + 'tool': tool + } + if tool_type not in result: + result[tool_type] = [] + result[tool_type].append(value) + else: + result[tool_type] = { + "uuid": str(uuid.uuid4()), + "schemaId": tool['featureSchemaId'], + "name": tool['name'], + "dataRow": { + "id": data_row_id_factory(idx), + }, + 'tool': tool + } + return result + + +@pytest.fixture +def prediction_id_mapping_datarow_id(): + # Maps tool types to feature schema ids + data_row_id = 'ck8q9q9qj00003g5z3q1q9q9q' + result = {} + + for _, tool in enumerate(ontology['tools'] + ontology['classifications']): + if 'tool' in tool: + tool_type = tool['tool'] + else: + tool_type = tool[ + 'type'] if 'scope' not in tool else f"{tool['type']}_{tool['scope']}" # so 'checklist' of 'checklist_index' + + # TODO: remove this once we have a better way to associate multiple tools instances with a single tool 
type + if tool_type == 'rectangle': + value = { + "uuid": str(uuid.uuid4()), + "schemaId": tool['featureSchemaId'], + "name": tool['name'], + "dataRow": { + "id": data_row_id, }, 'tool': tool } @@ -610,7 +679,7 @@ def prediction_id_mapping(configured_project): "schemaId": tool['featureSchemaId'], "name": tool['name'], "dataRow": { - "id": project.data_row_ids[idx], + "id": data_row_id, }, 'tool': tool } diff --git a/tests/integration/annotation_import/test_ndjson_validation.py b/tests/integration/annotation_import/test_ndjson_validation.py index 53bb85eed..466968e85 100644 --- a/tests/integration/annotation_import/test_ndjson_validation.py +++ b/tests/integration/annotation_import/test_ndjson_validation.py @@ -9,6 +9,24 @@ NDRadio, NDRectangle, NDText, NDTextEntity, NDTool, _validate_ndjson) +from labelbox.schema.labeling_frontend import LabelingFrontend +from labelbox.schema.queue_mode import QueueMode + + +@pytest.fixture +def configured_project_without_data_rows(client, + ontology, + rand_gen, + scope="module"): + project = client.create_project(name=rand_gen(str), + description=rand_gen(str), + queue_mode=QueueMode.Batch) + editor = list( + client.get_labeling_frontends( + where=LabelingFrontend.name == "editor"))[0] + project.setup(editor, ontology) + yield project + project.delete() def test_classification_construction(checklist_inference, text_inference): @@ -37,187 +55,198 @@ def test_tool_construction(inference, expected_type): def test_incorrect_feature_schema(rectangle_inference, polygon_inference, - configured_project): + configured_project_without_data_rows): #Valid but incorrect feature schema #Prob the error message says something about the config not anything useful. We might want to fix this. pred = rectangle_inference.copy() pred['schemaId'] = polygon_inference['schemaId'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def no_tool(text_inference, configured_project): +def no_tool(text_inference, configured_project_without_data_rows): pred = text_inference.copy() #Missing key del pred['answer'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_invalid_text(text_inference, configured_project): +def test_invalid_text(text_inference, configured_project_without_data_rows): #and if it is not a string pred = text_inference.copy() #Extra and wrong key del pred['answer'] pred['answers'] = [] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) del pred['answers'] #Invalid type pred['answer'] = [] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) #Invalid type pred['answer'] = None with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_invalid_checklist_item(checklist_inference, configured_project): +def test_invalid_checklist_item(checklist_inference, + configured_project_without_data_rows): #Only two points pred = checklist_inference.copy() pred['answers'] = [pred['answers'][0], pred['answers'][0]] #Duplicate schema ids with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], 
configured_project_without_data_rows) pred['answers'] = [{"name": "asdfg"}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) pred['answers'] = [{"schemaId": "1232132132"}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) pred['answers'] = [{}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) pred['answers'] = [] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) del pred['answers'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_invalid_polygon(polygon_inference, configured_project): +def test_invalid_polygon(polygon_inference, + configured_project_without_data_rows): #Only two points pred = polygon_inference.copy() pred['polygon'] = [{"x": 100, "y": 100}, {"x": 200, "y": 200}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_incorrect_entity(entity_inference, configured_project): +def test_incorrect_entity(entity_inference, + configured_project_without_data_rows): entity = entity_inference.copy() #Location cannot be a list entity["location"] = [0, 10] with pytest.raises(MALValidationError): - _validate_ndjson([entity], configured_project) + _validate_ndjson([entity], configured_project_without_data_rows) entity["location"] = {"start": -1, "end": 5} with pytest.raises(MALValidationError): - _validate_ndjson([entity], configured_project) + _validate_ndjson([entity], configured_project_without_data_rows) entity["location"] = {"start": 15, "end": 5} with pytest.raises(MALValidationError): - _validate_ndjson([entity], configured_project) + _validate_ndjson([entity], configured_project_without_data_rows) -def test_incorrect_mask(segmentation_inference, configured_project): +def test_incorrect_mask(segmentation_inference, + configured_project_without_data_rows): seg = segmentation_inference.copy() seg['mask']['colorRGB'] = [-1, 0, 10] with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project) + _validate_ndjson([seg], configured_project_without_data_rows) seg['mask']['colorRGB'] = [0, 0] with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project) + _validate_ndjson([seg], configured_project_without_data_rows) seg['mask'] = {'counts': [0], 'size': [0, 1]} with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project) + _validate_ndjson([seg], configured_project_without_data_rows) seg['mask'] = {'counts': [-1], 'size': [1, 1]} with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project) + _validate_ndjson([seg], configured_project_without_data_rows) -def test_all_validate_json(configured_project, predictions): +def test_all_validate_json(configured_project_without_data_rows, predictions): #Predictions contains one of each type of prediction. #These should be properly formatted and pass. 
- _validate_ndjson(predictions, configured_project) + _validate_ndjson(predictions, configured_project_without_data_rows) -def test_incorrect_line(line_inference, configured_project): +def test_incorrect_line(line_inference, configured_project_without_data_rows): line = line_inference.copy() line["line"] = [line["line"][0]] #Just one point with pytest.raises(MALValidationError): - _validate_ndjson([line], configured_project) + _validate_ndjson([line], configured_project_without_data_rows) -def test_incorrect_rectangle(rectangle_inference, configured_project): +def test_incorrect_rectangle(rectangle_inference, + configured_project_without_data_rows): del rectangle_inference['bbox']['top'] with pytest.raises(MALValidationError): - _validate_ndjson([rectangle_inference], configured_project) + _validate_ndjson([rectangle_inference], + configured_project_without_data_rows) -def test_duplicate_tools(rectangle_inference, configured_project): +def test_duplicate_tools(rectangle_inference, + configured_project_without_data_rows): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() pred['polygon'] = [{"x": 100, "y": 100}, {"x": 200, "y": 200}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_invalid_feature_schema(configured_project, rectangle_inference): +def test_invalid_feature_schema(configured_project_without_data_rows, + rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() pred['schemaId'] = "blahblah" with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_name_only_feature_schema(configured_project, rectangle_inference): +def test_name_only_feature_schema(configured_project_without_data_rows, + rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() del pred['schemaId'] - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_schema_id_only_feature_schema(configured_project, rectangle_inference): +def test_schema_id_only_feature_schema(configured_project_without_data_rows, + rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() del pred['name'] - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_missing_feature_schema(configured_project, rectangle_inference): +def test_missing_feature_schema(configured_project_without_data_rows, + rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() del pred['schemaId'] del pred['name'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_validate_ndjson(tmp_path, configured_project): +def test_validate_ndjson(tmp_path, configured_project_without_data_rows): file_name = f"broken.ndjson" file_path = tmp_path / file_name with file_path.open("w") as f: f.write("test") with pytest.raises(ValueError): - configured_project.upload_annotations(name="name", - annotations=str(file_path), - validate=True) + configured_project_without_data_rows.upload_annotations( + name="name", annotations=str(file_path), validate=True) -def 
test_validate_ndjson_uuid(tmp_path, configured_project, predictions): +def test_validate_ndjson_uuid(tmp_path, configured_project_without_data_rows, + predictions): file_name = f"repeat_uuid.ndjson" file_path = tmp_path / file_name repeat_uuid = predictions.copy() @@ -228,16 +257,15 @@ def test_validate_ndjson_uuid(tmp_path, configured_project, predictions): parser.dump(repeat_uuid, f) with pytest.raises(MALValidationError): - configured_project.upload_annotations(name="name", - validate=True, - annotations=str(file_path)) + configured_project_without_data_rows.upload_annotations( + name="name", validate=True, annotations=str(file_path)) with pytest.raises(MALValidationError): - configured_project.upload_annotations(name="name", - validate=True, - annotations=repeat_uuid) + configured_project_without_data_rows.upload_annotations( + name="name", validate=True, annotations=repeat_uuid) -def test_video_upload(video_checklist_inference, configured_project): +def test_video_upload(video_checklist_inference, + configured_project_without_data_rows): pred = video_checklist_inference.copy() - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index c47524ed6..82c739ddb 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -839,8 +839,6 @@ def pytest_fixture_setup(fixturedef, request): @pytest.fixture(scope='session', autouse=True) def print_perf_summary(): - print("Starting measurements\n", file=sys.stderr) - yield sorted_dict = dict( From 551c1efe0cc87b60c1b2f6509f613b75cb432d4b Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Tue, 15 Aug 2023 15:24:56 -0700 Subject: [PATCH 12/20] Replacing configured_project --- pytest.ini | 2 +- tests/conftest.py | 2 +- .../integration/annotation_import/conftest.py | 120 ++++++++--------- .../test_bulk_import_request.py | 41 +++--- .../test_conversation_import.py | 6 +- .../annotation_import/test_data_types.py | 23 ++-- .../annotation_import/test_label_import.py | 19 ++- .../annotation_import/test_model.py | 4 +- .../annotation_import/test_model_run.py | 14 +- .../test_ndjson_validation.py | 121 +++++++++--------- .../test_upsert_prediction_import.py | 18 +-- tests/integration/conftest.py | 15 ++- tests/integration/export_v2/conftest.py | 2 +- .../export_v2/test_export_video.py | 4 +- 14 files changed, 194 insertions(+), 197 deletions(-) diff --git a/pytest.ini b/pytest.ini index fbf64a864..b56afefdd 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] -addopts = -s -vv +addopts = -s -vv --reruns 5 --reruns-delay 10 --durations=20 markers = slow: marks tests as slow (deselect with '-m "not slow"') diff --git a/tests/conftest.py b/tests/conftest.py index b4dd6dce0..b724426d8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,7 +12,7 @@ ] -@pytest.fixture +@pytest.fixture(scope="session") def rand_gen(): def gen(field_type): diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index ca34d2dfb..3f1cd7de5 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -211,7 +211,7 @@ def annotations_by_data_type_v2( } -@pytest.fixture +@pytest.fixture(scope='session') def ontology(): bbox_tool_with_nested_text = { 'required': @@ -479,48 +479,45 @@ def func(project): @pytest.fixture -def initial_dataset(client, rand_gen): - dataset = 
client.create_dataset(name=rand_gen(str)) - yield dataset - dataset.delete() - - -@pytest.fixture -def hardcoded_datarow_id(): - data_row_id = 'ck8q9q9qj00003g5z3q1q9q9q' +def configured_project_datarow_id(configured_project): def get_data_row_id(indx=0): - return data_row_id + return configured_project.data_row_ids[indx] yield get_data_row_id @pytest.fixture -def configured_project_datarow_id(configured_project): +def configured_project_one_datarow_id(configured_project_with_one_data_row): def get_data_row_id(indx=0): - return configured_project.data_row_ids[indx] + return configured_project_with_one_data_row.data_row_ids[0] yield get_data_row_id @pytest.fixture -def configured_project(configured_project_without_data_rows, initial_dataset, - ontology, rand_gen, image_url): +def configured_project(client, initial_dataset, ontology, rand_gen, image_url): start_time = time.time() dataset = initial_dataset - project = configured_project_without_data_rows + project = client.create_project(name=rand_gen(str), + queue_mode=QueueMode.Batch) + editor = list( + client.get_labeling_frontends( + where=LabelingFrontend.name == "editor"))[0] + project.setup(editor, ontology) + num_rows = 0 data_row_ids = [] - # print("Before creating data rows ", time.time() - start_time) - num_rows = 0 + for _ in range(len(ontology['tools']) + len(ontology['classifications'])): data_row_ids.append(dataset.create_data_row(row_data=image_url).uid) num_rows += 1 - # print("After creating data rows ", time.time() - start_time) - - pytest.data_row_report['times'] += time.time() - start_time - pytest.data_row_report['num_rows'] += num_rows + project._wait_until_data_rows_are_processed(data_row_ids=data_row_ids, + sleep_interval=3) + if pytest.data_row_report: + pytest.data_row_report['times'] += time.time() - start_time + pytest.data_row_report['num_rows'] += num_rows project.create_batch( rand_gen(str), data_row_ids, # sample of data row objects @@ -580,7 +577,10 @@ def dataset_conversation_entity(client, rand_gen, conversation_entity_data_row, @pytest.fixture -def configured_project_without_data_rows(client, ontology, rand_gen): +def configured_project_with_one_data_row(client, ontology, rand_gen, + initial_dataset, image_url): + start_time = time.time() + project = client.create_project(name=rand_gen(str), description=rand_gen(str), queue_mode=QueueMode.Batch) @@ -588,7 +588,25 @@ def configured_project_without_data_rows(client, ontology, rand_gen): client.get_labeling_frontends( where=LabelingFrontend.name == "editor"))[0] project.setup(editor, ontology) + + data_row = initial_dataset.create_data_row(row_data=image_url) + data_row_ids = [data_row.uid] + project._wait_until_data_rows_are_processed(data_row_ids=data_row_ids, + sleep_interval=3) + + if pytest.data_row_report: + pytest.data_row_report['times'] += time.time() - start_time + pytest.data_row_report['num_rows'] += 1 + batch = project.create_batch( + rand_gen(str), + data_row_ids, # sample of data row objects + 5 # priority between 1(Highest) - 5(lowest) + ) + project.data_row_ids = data_row_ids + yield project + + batch.delete() project.delete() @@ -597,16 +615,20 @@ def configured_project_without_data_rows(client, ontology, rand_gen): # In an example of a 'rectangle' we have extended to support multiple instances of the same tool type # TODO: we will support this approach in the future for all tools @pytest.fixture -def prediction_id_mapping(configured_project_without_data_rows, ontology, - request): +def prediction_id_mapping(ontology, request): # Maps tool 
types to feature schema ids if 'configured_project' in request.fixturenames: data_row_id_factory = request.getfixturevalue( 'configured_project_datarow_id') - project = configured_project - else: + project = request.getfixturevalue('configured_project') + elif 'hardcoded_datarow_id' in request.fixturenames: data_row_id_factory = request.getfixturevalue('hardcoded_datarow_id') - project = configured_project_without_data_rows + project = request.getfixturevalue('configured_project_with_ontology') + else: + data_row_id_factory = request.getfixturevalue( + 'configured_project_one_datarow_id') + project = request.getfixturevalue( + 'configured_project_with_one_data_row') ontology = project.ontology().normalized @@ -646,46 +668,6 @@ def prediction_id_mapping(configured_project_without_data_rows, ontology, return result -@pytest.fixture -def prediction_id_mapping_datarow_id(): - # Maps tool types to feature schema ids - data_row_id = 'ck8q9q9qj00003g5z3q1q9q9q' - result = {} - - for _, tool in enumerate(ontology['tools'] + ontology['classifications']): - if 'tool' in tool: - tool_type = tool['tool'] - else: - tool_type = tool[ - 'type'] if 'scope' not in tool else f"{tool['type']}_{tool['scope']}" # so 'checklist' of 'checklist_index' - - # TODO: remove this once we have a better way to associate multiple tools instances with a single tool type - if tool_type == 'rectangle': - value = { - "uuid": str(uuid.uuid4()), - "schemaId": tool['featureSchemaId'], - "name": tool['name'], - "dataRow": { - "id": data_row_id, - }, - 'tool': tool - } - if tool_type not in result: - result[tool_type] = [] - result[tool_type].append(value) - else: - result[tool_type] = { - "uuid": str(uuid.uuid4()), - "schemaId": tool['featureSchemaId'], - "name": tool['name'], - "dataRow": { - "id": data_row_id, - }, - 'tool': tool - } - return result - - @pytest.fixture def polygon_inference(prediction_id_mapping): polygon = prediction_id_mapping['polygon'].copy() @@ -1079,7 +1061,6 @@ def model_run_with_training_metadata(rand_gen, model): @pytest.fixture def model_run_with_data_rows(client, configured_project, model_run_predictions, model_run, wait_for_label_processing): - start_time = time.time() configured_project.enable_model_assisted_labeling() upload_task = LabelImport.create_from_objects( @@ -1093,7 +1074,6 @@ def model_run_with_data_rows(client, configured_project, model_run_predictions, labels = wait_for_label_processing(configured_project) label_ids = [label.uid for label in labels] model_run.upsert_labels(label_ids) - print(f"model_run_with_data_rows: {time.time() - start_time}") yield model_run model_run.delete() # TODO: Delete resources when that is possible .. 
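The conftest change above routes prediction_id_mapping through request.fixturenames and request.getfixturevalue so each test decides which data-row source it gets. A minimal, self-contained sketch of that dispatch pattern follows; the fixture and test names here are hypothetical illustrations, not part of the patch:

import pytest


@pytest.fixture
def per_tool_data_row_ids():
    # Stands in for configured_project_datarow_id: one data row id per ontology tool.
    def factory(indx=0):
        return f"datarow-{indx}"

    yield factory


@pytest.fixture
def single_data_row_id():
    # Stands in for configured_project_one_datarow_id: every prediction shares one row.
    def factory(indx=0):
        return "datarow-0"

    yield factory


@pytest.fixture
def data_row_id_factory(request):
    # Same dispatch idea as prediction_id_mapping: pick whichever source the
    # requesting test opted into, falling back to the single-row default.
    if "per_tool_data_row_ids" in request.fixturenames:
        return request.getfixturevalue("per_tool_data_row_ids")
    return request.getfixturevalue("single_data_row_id")


def test_uses_per_tool_rows(data_row_id_factory, per_tool_data_row_ids):
    assert data_row_id_factory(3) == "datarow-3"


def test_defaults_to_single_row(data_row_id_factory):
    assert data_row_id_factory(3) == "datarow-0"

The design intent, as the surrounding commits describe it, is that validation-only tests can point predictions at a hardcoded or single data row id instead of creating one data row per ontology tool, which keeps those tests fast and avoids sharing project state between tests.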
diff --git a/tests/integration/annotation_import/test_bulk_import_request.py b/tests/integration/annotation_import/test_bulk_import_request.py index 7a66dd667..6691cc044 100644 --- a/tests/integration/annotation_import/test_bulk_import_request.py +++ b/tests/integration/annotation_import/test_bulk_import_request.py @@ -51,14 +51,14 @@ def test_validate_file(project_with_ontology): #Schema ids shouldn't match -def test_create_from_objects(configured_project_without_data_rows, predictions, +def test_create_from_objects(configured_project_with_one_data_row, predictions, annotation_import_test_helpers): name = str(uuid.uuid4()) - bulk_import_request = configured_project_without_data_rows.upload_annotations( + bulk_import_request = configured_project_with_one_data_row.upload_annotations( name=name, annotations=predictions) - assert bulk_import_request.project() == configured_project_without_data_rows + assert bulk_import_request.project() == configured_project_with_one_data_row assert bulk_import_request.name == name assert bulk_import_request.error_file_url is None assert bulk_import_request.status_file_url is None @@ -105,17 +105,17 @@ def test_create_from_local_file(tmp_path, predictions, configured_project, bulk_import_request.input_file_url, predictions) -def test_get(client, configured_project_without_data_rows): +def test_get(client, configured_project_with_one_data_row): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" - configured_project_without_data_rows.upload_annotations(name=name, + configured_project_with_one_data_row.upload_annotations(name=name, annotations=url, validate=False) bulk_import_request = BulkImportRequest.from_name( - client, project_id=configured_project_without_data_rows.uid, name=name) + client, project_id=configured_project_with_one_data_row.uid, name=name) - assert bulk_import_request.project() == configured_project_without_data_rows + assert bulk_import_request.project() == configured_project_with_one_data_row assert bulk_import_request.name == name assert bulk_import_request.input_file_url == url assert bulk_import_request.error_file_url is None @@ -123,16 +123,15 @@ def test_get(client, configured_project_without_data_rows): assert bulk_import_request.state == BulkImportRequestState.RUNNING -def test_validate_ndjson(tmp_path, configured_project): +def test_validate_ndjson(tmp_path, configured_project_with_one_data_row): file_name = f"broken.ndjson" file_path = tmp_path / file_name with file_path.open("w") as f: f.write("test") with pytest.raises(ValueError): - configured_project.upload_annotations(name="name", - validate=True, - annotations=str(file_path)) + configured_project_with_one_data_row.upload_annotations( + name="name", validate=True, annotations=str(file_path)) def test_validate_ndjson_uuid(tmp_path, configured_project, predictions): @@ -159,11 +158,11 @@ def test_validate_ndjson_uuid(tmp_path, configured_project, predictions): @pytest.mark.slow def test_wait_till_done(rectangle_inference, - configured_project_without_data_rows): + configured_project_with_one_data_row): name = str(uuid.uuid4()) - url = configured_project_without_data_rows.client.upload_data( + url = configured_project_with_one_data_row.client.upload_data( content=parser.dumps([rectangle_inference]), sign=True) - bulk_import_request = configured_project_without_data_rows.upload_annotations( + bulk_import_request = configured_project_with_one_data_row.upload_annotations( name=name, annotations=url, validate=False) 
assert len(bulk_import_request.inputs) == 1 @@ -298,7 +297,7 @@ def test_pdf_mal_bbox(client, configured_project_pdf): assert import_annotations.errors == [] -def test_pdf_document_entity(client, configured_project_without_data_rows, +def test_pdf_document_entity(client, configured_project_with_one_data_row, dataset_pdf_entity, rand_gen): # for content "Metal-insulator (MI) transitions have been one of the" in OCR JSON extract tests/assets/arxiv-pdf_data_99-word-token-pdfs_0801.3483-lb-textlayer.json document_text_selection = DocumentTextSelection( @@ -322,7 +321,7 @@ def test_pdf_document_entity(client, configured_project_without_data_rows, labels = [] _, data_row_uids = dataset_pdf_entity - configured_project_without_data_rows.create_batch( + configured_project_with_one_data_row.create_batch( rand_gen(str), data_row_uids, # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) @@ -337,7 +336,7 @@ def test_pdf_document_entity(client, configured_project_without_data_rows, import_annotations = MALPredictionImport.create_from_objects( client=client, - project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, name=f"import {str(uuid.uuid4())}", predictions=labels) import_annotations.wait_until_done() @@ -346,14 +345,14 @@ def test_pdf_document_entity(client, configured_project_without_data_rows, def test_nested_video_object_annotations(client, - configured_project_without_data_rows, + configured_project_with_one_data_row, video_data, bbox_video_annotation_objects, rand_gen): labels = [] _, data_row_uids = video_data - configured_project_without_data_rows.update(media_type=MediaType.Video) - configured_project_without_data_rows.create_batch( + configured_project_with_one_data_row.update(media_type=MediaType.Video) + configured_project_with_one_data_row.create_batch( rand_gen(str), data_row_uids, # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) @@ -365,7 +364,7 @@ def test_nested_video_object_annotations(client, annotations=bbox_video_annotation_objects)) import_annotations = MALPredictionImport.create_from_objects( client=client, - project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, name=f"import {str(uuid.uuid4())}", predictions=labels) import_annotations.wait_until_done() diff --git a/tests/integration/annotation_import/test_conversation_import.py b/tests/integration/annotation_import/test_conversation_import.py index ac2d5419c..9f1d26e31 100644 --- a/tests/integration/annotation_import/test_conversation_import.py +++ b/tests/integration/annotation_import/test_conversation_import.py @@ -7,7 +7,7 @@ from labelbox.schema.annotation_import import MALPredictionImport -def test_conversation_entity(client, configured_project_without_data_rows, +def test_conversation_entity(client, configured_project_with_one_data_row, dataset_conversation_entity, rand_gen): conversation_entity_annotation = ConversationEntity(start=0, @@ -20,7 +20,7 @@ def test_conversation_entity(client, configured_project_without_data_rows, labels = [] _, data_row_uids = dataset_conversation_entity - configured_project_without_data_rows.create_batch( + configured_project_with_one_data_row.create_batch( rand_gen(str), data_row_uids, # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) @@ -35,7 +35,7 @@ def test_conversation_entity(client, configured_project_without_data_rows, import_annotations = MALPredictionImport.create_from_objects( client=client, - 
project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, name=f"import {str(uuid.uuid4())}", predictions=labels) diff --git a/tests/integration/annotation_import/test_data_types.py b/tests/integration/annotation_import/test_data_types.py index 30559198b..5de79f5cc 100644 --- a/tests/integration/annotation_import/test_data_types.py +++ b/tests/integration/annotation_import/test_data_types.py @@ -125,6 +125,7 @@ def create_data_row_for_project(project, dataset, data_row_ndjson, batch_name): [data_row.uid], # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) ) + project.data_row_ids.append(data_row.uid) return data_row @@ -134,11 +135,11 @@ def create_data_row_for_project(project, dataset, data_row_ndjson, batch_name): AudioData, ConversationData, DicomData, DocumentData, HTMLData, ImageData, TextData ]) -def test_import_data_types(client, project, initial_dataset, rand_gen, - data_row_json_by_data_type, annotations_by_data_type, - data_type_class): +def test_import_data_types(client, configured_project, initial_dataset, + rand_gen, data_row_json_by_data_type, + annotations_by_data_type, data_type_class): - project = project + project = configured_project project_id = project.uid dataset = initial_dataset @@ -260,11 +261,11 @@ def test_import_data_types_v2(client, configured_project, initial_dataset, @pytest.mark.parametrize('data_type, data_class, annotations', test_params) -def test_import_label_annotations(client, configured_project_without_data_rows, +def test_import_label_annotations(client, configured_project_with_one_data_row, initial_dataset, data_row_json_by_data_type, data_type, data_class, annotations, rand_gen): - project = configured_project_without_data_rows + project = configured_project_with_one_data_row dataset = initial_dataset set_project_media_type_from_data_type(project, data_class) @@ -297,13 +298,13 @@ def test_import_label_annotations(client, configured_project_without_data_rows, expected_annotations = get_annotation_comparison_dicts_from_labels(labels) actual_annotations = get_annotation_comparison_dicts_from_export( export_task.result, data_row.uid, - configured_project_without_data_rows.uid) + configured_project_with_one_data_row.uid) assert actual_annotations == expected_annotations data_row.delete() @pytest.mark.parametrize('data_type, data_class, annotations', test_params) -def test_import_mal_annotations(client, configured_project_without_data_rows, +def test_import_mal_annotations(client, configured_project_with_one_data_row, data_row_json_by_data_type, data_type, data_class, annotations, rand_gen): @@ -311,10 +312,10 @@ def test_import_mal_annotations(client, configured_project_without_data_rows, data_row_json = data_row_json_by_data_type[data_type] data_row = dataset.create_data_row(data_row_json) - set_project_media_type_from_data_type(configured_project_without_data_rows, + set_project_media_type_from_data_type(configured_project_with_one_data_row, data_class) - configured_project_without_data_rows.create_batch( + configured_project_with_one_data_row.create_batch( rand_gen(str), [data_row.uid], ) @@ -326,7 +327,7 @@ def test_import_mal_annotations(client, configured_project_without_data_rows, import_annotations = lb.MALPredictionImport.create_from_objects( client=client, - project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, name=f"import {str(uuid.uuid4())}", predictions=labels) import_annotations.wait_until_done() diff --git 
a/tests/integration/annotation_import/test_label_import.py b/tests/integration/annotation_import/test_label_import.py index 198ce2e3e..61c602c52 100644 --- a/tests/integration/annotation_import/test_label_import.py +++ b/tests/integration/annotation_import/test_label_import.py @@ -9,13 +9,16 @@ """ -def test_create_from_url(client, configured_project, +def test_create_from_url(client, configured_project_with_one_data_row, annotation_import_test_helpers): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" label_import = LabelImport.create_from_url( - client=client, project_id=configured_project.uid, name=name, url=url) - assert label_import.parent_id == configured_project.uid + client=client, + project_id=configured_project_with_one_data_row.uid, + name=name, + url=url) + assert label_import.parent_id == configured_project_with_one_data_row.uid annotation_import_test_helpers.check_running_state(label_import, name, url) @@ -52,13 +55,17 @@ def test_create_from_objects(client, configured_project, object_predictions, # annotation_import_test_helpers.assert_file_content(label_import.input_file_url, object_predictions) -def test_get(client, configured_project, annotation_import_test_helpers): +def test_get(client, configured_project_with_one_data_row, + annotation_import_test_helpers): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" label_import = LabelImport.create_from_url( - client=client, project_id=configured_project.uid, name=name, url=url) + client=client, + project_id=configured_project_with_one_data_row.uid, + name=name, + url=url) - assert label_import.parent_id == configured_project.uid + assert label_import.parent_id == configured_project_with_one_data_row.uid annotation_import_test_helpers.check_running_state(label_import, name, url) diff --git a/tests/integration/annotation_import/test_model.py b/tests/integration/annotation_import/test_model.py index dcfe9ef2c..131ecd9d0 100644 --- a/tests/integration/annotation_import/test_model.py +++ b/tests/integration/annotation_import/test_model.py @@ -4,14 +4,14 @@ from labelbox.exceptions import ResourceNotFoundError -def test_model(client, configured_project, rand_gen): +def test_model(client, configured_project_with_one_data_row, rand_gen): # Get all models = list(client.get_models()) for m in models: assert isinstance(m, Model) # Create - ontology = configured_project.ontology() + ontology = configured_project_with_one_data_row.ontology() data = {"name": rand_gen(str), "ontology_id": ontology.uid} model = client.create_model(data["name"], data["ontology_id"]) assert model.name == data["name"] diff --git a/tests/integration/annotation_import/test_model_run.py b/tests/integration/annotation_import/test_model_run.py index c94c78cde..328b38ba5 100644 --- a/tests/integration/annotation_import/test_model_run.py +++ b/tests/integration/annotation_import/test_model_run.py @@ -87,11 +87,12 @@ def test_model_run_data_rows_delete(model_run_with_data_rows): assert len(before) == len(after) + 1 -def test_model_run_upsert_data_rows(dataset, model_run, configured_project): +def test_model_run_upsert_data_rows(dataset, model_run, + configured_project_with_one_data_row): n_model_run_data_rows = len(list(model_run.model_run_data_rows())) assert n_model_run_data_rows == 0 data_row = dataset.create_data_row(row_data="test row data") - configured_project._wait_until_data_rows_are_processed( + 
configured_project_with_one_data_row._wait_until_data_rows_are_processed( data_row_ids=[data_row.uid]) model_run.upsert_data_rows([data_row.uid]) n_model_run_data_rows = len(list(model_run.model_run_data_rows())) @@ -167,15 +168,14 @@ def get_model_run_status(): errorMessage) -def test_model_run_split_assignment_by_data_row_ids(model_run, dataset, - image_url, - configured_project): - n_data_rows = 10 +def test_model_run_split_assignment_by_data_row_ids( + model_run, dataset, image_url, configured_project_with_one_data_row): + n_data_rows = 2 data_rows = dataset.create_data_rows([{ "row_data": image_url } for _ in range(n_data_rows)]) data_row_ids = [data_row['id'] for data_row in data_rows.result] - configured_project._wait_until_data_rows_are_processed( + configured_project_with_one_data_row._wait_until_data_rows_are_processed( data_row_ids=data_row_ids) model_run.upsert_data_rows(data_row_ids) diff --git a/tests/integration/annotation_import/test_ndjson_validation.py b/tests/integration/annotation_import/test_ndjson_validation.py index 466968e85..123752402 100644 --- a/tests/integration/annotation_import/test_ndjson_validation.py +++ b/tests/integration/annotation_import/test_ndjson_validation.py @@ -13,19 +13,27 @@ from labelbox.schema.queue_mode import QueueMode -@pytest.fixture -def configured_project_without_data_rows(client, - ontology, - rand_gen, - scope="module"): +@pytest.fixture(scope="module", autouse=True) +def hardcoded_datarow_id(): + data_row_id = 'ck8q9q9qj00003g5z3q1q9q9q' + + def get_data_row_id(indx=0): + return data_row_id + + yield get_data_row_id + + +@pytest.fixture(scope="module", autouse=True) +def configured_project_with_ontology(client, ontology, rand_gen): project = client.create_project(name=rand_gen(str), - description=rand_gen(str), queue_mode=QueueMode.Batch) editor = list( client.get_labeling_frontends( where=LabelingFrontend.name == "editor"))[0] project.setup(editor, ontology) + yield project + project.delete() @@ -55,197 +63,194 @@ def test_tool_construction(inference, expected_type): def test_incorrect_feature_schema(rectangle_inference, polygon_inference, - configured_project_without_data_rows): + configured_project_with_ontology): #Valid but incorrect feature schema #Prob the error message says something about the config not anything useful. We might want to fix this. 
pred = rectangle_inference.copy() pred['schemaId'] = polygon_inference['schemaId'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def no_tool(text_inference, configured_project_without_data_rows): +def no_tool(text_inference, configured_project_with_ontology): pred = text_inference.copy() #Missing key del pred['answer'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_invalid_text(text_inference, configured_project_without_data_rows): +def test_invalid_text(text_inference, configured_project_with_ontology): #and if it is not a string pred = text_inference.copy() #Extra and wrong key del pred['answer'] pred['answers'] = [] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) del pred['answers'] #Invalid type pred['answer'] = [] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) #Invalid type pred['answer'] = None with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) def test_invalid_checklist_item(checklist_inference, - configured_project_without_data_rows): + configured_project_with_ontology): #Only two points pred = checklist_inference.copy() pred['answers'] = [pred['answers'][0], pred['answers'][0]] #Duplicate schema ids with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) pred['answers'] = [{"name": "asdfg"}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) pred['answers'] = [{"schemaId": "1232132132"}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) pred['answers'] = [{}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) pred['answers'] = [] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) del pred['answers'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_invalid_polygon(polygon_inference, - configured_project_without_data_rows): +def test_invalid_polygon(polygon_inference, configured_project_with_ontology): #Only two points pred = polygon_inference.copy() pred['polygon'] = [{"x": 100, "y": 100}, {"x": 200, "y": 200}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_incorrect_entity(entity_inference, - configured_project_without_data_rows): +def test_incorrect_entity(entity_inference, configured_project_with_ontology): entity = entity_inference.copy() #Location cannot be a list 
entity["location"] = [0, 10] with pytest.raises(MALValidationError): - _validate_ndjson([entity], configured_project_without_data_rows) + _validate_ndjson([entity], configured_project_with_ontology) entity["location"] = {"start": -1, "end": 5} with pytest.raises(MALValidationError): - _validate_ndjson([entity], configured_project_without_data_rows) + _validate_ndjson([entity], configured_project_with_ontology) entity["location"] = {"start": 15, "end": 5} with pytest.raises(MALValidationError): - _validate_ndjson([entity], configured_project_without_data_rows) + _validate_ndjson([entity], configured_project_with_ontology) def test_incorrect_mask(segmentation_inference, - configured_project_without_data_rows): + configured_project_with_ontology): seg = segmentation_inference.copy() seg['mask']['colorRGB'] = [-1, 0, 10] with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project_without_data_rows) + _validate_ndjson([seg], configured_project_with_ontology) seg['mask']['colorRGB'] = [0, 0] with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project_without_data_rows) + _validate_ndjson([seg], configured_project_with_ontology) seg['mask'] = {'counts': [0], 'size': [0, 1]} with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project_without_data_rows) + _validate_ndjson([seg], configured_project_with_ontology) seg['mask'] = {'counts': [-1], 'size': [1, 1]} with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project_without_data_rows) + _validate_ndjson([seg], configured_project_with_ontology) -def test_all_validate_json(configured_project_without_data_rows, predictions): +def test_all_validate_json(configured_project_with_ontology, predictions): #Predictions contains one of each type of prediction. #These should be properly formatted and pass. 
- _validate_ndjson(predictions, configured_project_without_data_rows) + _validate_ndjson(predictions, configured_project_with_ontology) -def test_incorrect_line(line_inference, configured_project_without_data_rows): +def test_incorrect_line(line_inference, configured_project_with_ontology): line = line_inference.copy() line["line"] = [line["line"][0]] #Just one point with pytest.raises(MALValidationError): - _validate_ndjson([line], configured_project_without_data_rows) + _validate_ndjson([line], configured_project_with_ontology) def test_incorrect_rectangle(rectangle_inference, - configured_project_without_data_rows): + configured_project_with_ontology): del rectangle_inference['bbox']['top'] with pytest.raises(MALValidationError): _validate_ndjson([rectangle_inference], - configured_project_without_data_rows) + configured_project_with_ontology) -def test_duplicate_tools(rectangle_inference, - configured_project_without_data_rows): +def test_duplicate_tools(rectangle_inference, configured_project_with_ontology): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() pred['polygon'] = [{"x": 100, "y": 100}, {"x": 200, "y": 200}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_invalid_feature_schema(configured_project_without_data_rows, +def test_invalid_feature_schema(configured_project_with_ontology, rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() pred['schemaId'] = "blahblah" with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_name_only_feature_schema(configured_project_without_data_rows, +def test_name_only_feature_schema(configured_project_with_ontology, rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() del pred['schemaId'] - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_schema_id_only_feature_schema(configured_project_without_data_rows, +def test_schema_id_only_feature_schema(configured_project_with_ontology, rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() del pred['name'] - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_missing_feature_schema(configured_project_without_data_rows, +def test_missing_feature_schema(configured_project_with_ontology, rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() del pred['schemaId'] del pred['name'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_validate_ndjson(tmp_path, configured_project_without_data_rows): +def test_validate_ndjson(tmp_path, configured_project_with_ontology): file_name = f"broken.ndjson" file_path = tmp_path / file_name with file_path.open("w") as f: f.write("test") with pytest.raises(ValueError): - configured_project_without_data_rows.upload_annotations( + configured_project_with_ontology.upload_annotations( name="name", annotations=str(file_path), validate=True) -def 
test_validate_ndjson_uuid(tmp_path, configured_project_without_data_rows, +def test_validate_ndjson_uuid(tmp_path, configured_project_with_ontology, predictions): file_name = f"repeat_uuid.ndjson" file_path = tmp_path / file_name @@ -257,15 +262,15 @@ def test_validate_ndjson_uuid(tmp_path, configured_project_without_data_rows, parser.dump(repeat_uuid, f) with pytest.raises(MALValidationError): - configured_project_without_data_rows.upload_annotations( + configured_project_with_ontology.upload_annotations( name="name", validate=True, annotations=str(file_path)) with pytest.raises(MALValidationError): - configured_project_without_data_rows.upload_annotations( + configured_project_with_ontology.upload_annotations( name="name", validate=True, annotations=repeat_uuid) def test_video_upload(video_checklist_inference, - configured_project_without_data_rows): + configured_project_with_ontology): pred = video_checklist_inference.copy() - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) diff --git a/tests/integration/annotation_import/test_upsert_prediction_import.py b/tests/integration/annotation_import/test_upsert_prediction_import.py index 9ed045f5e..927b6526d 100644 --- a/tests/integration/annotation_import/test_upsert_prediction_import.py +++ b/tests/integration/annotation_import/test_upsert_prediction_import.py @@ -13,7 +13,7 @@ @pytest.mark.skip() def test_create_from_url(client, tmp_path, object_predictions, model_run_with_data_rows, - configured_project_without_data_rows, + configured_project_with_one_data_row, annotation_import_test_helpers): name = str(uuid.uuid4()) file_name = f"{name}.json" @@ -41,7 +41,7 @@ def test_create_from_url(client, tmp_path, object_predictions, annotation_import, batch, mal_prediction_import = model_run_with_data_rows.upsert_predictions_and_send_to_project( name=name, predictions=url, - project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, priority=5) assert annotation_import.model_run_id == model_run_with_data_rows.uid @@ -50,7 +50,7 @@ def test_create_from_url(client, tmp_path, object_predictions, assert annotation_import.statuses assert batch - assert batch.project().uid == configured_project_without_data_rows.uid + assert batch.project().uid == configured_project_with_one_data_row.uid assert mal_prediction_import mal_prediction_import.wait_until_done() @@ -61,7 +61,7 @@ def test_create_from_url(client, tmp_path, object_predictions, @pytest.mark.skip() def test_create_from_objects(model_run_with_data_rows, - configured_project_without_data_rows, + configured_project_with_one_data_row, object_predictions, annotation_import_test_helpers): name = str(uuid.uuid4()) @@ -76,7 +76,7 @@ def test_create_from_objects(model_run_with_data_rows, annotation_import, batch, mal_prediction_import = model_run_with_data_rows.upsert_predictions_and_send_to_project( name=name, predictions=predictions, - project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, priority=5) assert annotation_import.model_run_id == model_run_with_data_rows.uid @@ -85,7 +85,7 @@ def test_create_from_objects(model_run_with_data_rows, assert annotation_import.statuses assert batch - assert batch.project().uid == configured_project_without_data_rows.uid + assert batch.project().uid == configured_project_with_one_data_row.uid assert mal_prediction_import mal_prediction_import.wait_until_done() @@ -96,7 +96,7 @@ def 
test_create_from_objects(model_run_with_data_rows, @pytest.mark.skip() def test_create_from_local_file(tmp_path, model_run_with_data_rows, - configured_project_without_data_rows, + configured_project_with_one_data_row, object_predictions, annotation_import_test_helpers): @@ -119,7 +119,7 @@ def test_create_from_local_file(tmp_path, model_run_with_data_rows, annotation_import, batch, mal_prediction_import = model_run_with_data_rows.upsert_predictions_and_send_to_project( name=name, predictions=str(file_path), - project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, priority=5) assert annotation_import.model_run_id == model_run_with_data_rows.uid @@ -128,7 +128,7 @@ def test_create_from_local_file(tmp_path, model_run_with_data_rows, assert annotation_import.statuses assert batch - assert batch.project().uid == configured_project_without_data_rows.uid + assert batch.project().uid == configured_project_with_one_data_row.uid assert mal_prediction_import mal_prediction_import.wait_until_done() diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 82c739ddb..56ce6bae1 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -21,6 +21,7 @@ from labelbox.schema.annotation_import import LabelImport from labelbox.schema.enums import AnnotationImportState from labelbox.schema.invite import Invite +from labelbox.schema.project import Project from labelbox.schema.queue_mode import QueueMode from labelbox.schema.user import User @@ -425,16 +426,21 @@ def configured_project_with_label(client, rand_gen, image_url, project, dataset, Additionally includes a create_label method for any needed extra labels One label is already created and yielded when using fixture """ + start_time = time.time() + project._wait_until_data_rows_are_processed(data_row_ids=[data_row.uid], + sleep_interval=3) project.create_batch( rand_gen(str), [data_row.uid], # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) ) + print("create_batch took: ", time.time() - start_time) ontology = _setup_ontology(project) + print("setup ontology took: ", time.time() - start_time) label = _create_label(project, data_row, ontology, wait_for_label_processing) - + print("create_label took: ", time.time() - start_time) yield [project, dataset, data_row, label] for label in project.labels(): @@ -817,11 +823,13 @@ def upload_invalid_data_rows_for_dataset(dataset: Dataset): task.wait_till_done() +@pytest.mark.skipif("FIXTURE_PROFILE" not in os.environ) def pytest_configure(): pytest.report = defaultdict(int) pytest.data_row_report = {'times': 0, 'num_rows': 0} +@pytest.mark.skipif("FIXTURE_PROFILE" not in os.environ) @pytest.hookimpl(hookwrapper=True) def pytest_fixture_setup(fixturedef, request): start = time.time() @@ -832,11 +840,8 @@ def pytest_fixture_setup(fixturedef, request): exec_time = end - start pytest.report[fixturedef.argname] += exec_time - # print('pytest_fixture_setup' - # f', request={request}' - # f', create_data_row_time={end - start}') - +@pytest.mark.skipif("FIXTURE_PROFILE" not in os.environ) @pytest.fixture(scope='session', autouse=True) def print_perf_summary(): yield diff --git a/tests/integration/export_v2/conftest.py b/tests/integration/export_v2/conftest.py index af8b4c66f..757bba44e 100644 --- a/tests/integration/export_v2/conftest.py +++ b/tests/integration/export_v2/conftest.py @@ -297,7 +297,7 @@ def configured_project_with_ontology(client, initial_dataset, ontology, @pytest.fixture -def 
configured_project_without_data_rows(client, ontology, rand_gen): +def configured_project_with_one_data_row(client, ontology, rand_gen): project = client.create_project(name=rand_gen(str), description=rand_gen(str), queue_mode=QueueMode.Batch) diff --git a/tests/integration/export_v2/test_export_video.py b/tests/integration/export_v2/test_export_video.py index 863f4d31e..94828e1b6 100644 --- a/tests/integration/export_v2/test_export_video.py +++ b/tests/integration/export_v2/test_export_video.py @@ -5,11 +5,11 @@ from labelbox.schema.annotation_import import AnnotationImportState -def test_export_v2_video(client, configured_project_without_data_rows, +def test_export_v2_video(client, configured_project_with_one_data_row, video_data, video_data_row, bbox_video_annotation_objects, rand_gen): - project = configured_project_without_data_rows + project = configured_project_with_one_data_row project_id = project.uid labels = [] _, data_row_uids = video_data From 65990878d68bb1c35a3c9e6f6dff27c83ba1ea40 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Fri, 18 Aug 2023 10:58:02 -0700 Subject: [PATCH 13/20] Remove more sources of data leakage --- .../annotation_import/test_data_types.py | 19 ++++++++++++++----- tests/integration/conftest.py | 2 ++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/tests/integration/annotation_import/test_data_types.py b/tests/integration/annotation_import/test_data_types.py index 5de79f5cc..a5c27eb20 100644 --- a/tests/integration/annotation_import/test_data_types.py +++ b/tests/integration/annotation_import/test_data_types.py @@ -137,7 +137,8 @@ def create_data_row_for_project(project, dataset, data_row_ndjson, batch_name): ]) def test_import_data_types(client, configured_project, initial_dataset, rand_gen, data_row_json_by_data_type, - annotations_by_data_type, data_type_class): + annotations_by_data_type, data_type_class, + one_datarow): project = configured_project project_id = project.uid @@ -304,14 +305,22 @@ def test_import_label_annotations(client, configured_project_with_one_data_row, @pytest.mark.parametrize('data_type, data_class, annotations', test_params) -def test_import_mal_annotations(client, configured_project_with_one_data_row, - data_row_json_by_data_type, data_type, - data_class, annotations, rand_gen): - +@pytest.fixture +def one_datarow(client, rand_gen, data_row_json_by_data_type, data_type): dataset = client.create_dataset(name=rand_gen(str)) data_row_json = data_row_json_by_data_type[data_type] data_row = dataset.create_data_row(data_row_json) + yield data_row + + dataset.delete() + + +@pytest.mark.parametrize('data_type, data_class, annotations', test_params) +def test_import_mal_annotations(client, configured_project_with_one_data_row, + data_type, data_class, annotations, rand_gen, + one_datarow): + data_row = one_datarow set_project_media_type_from_data_type(configured_project_with_one_data_row, data_class) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 56ce6bae1..af66a7ed4 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -390,6 +390,8 @@ def initial_dataset(client, rand_gen): dataset = client.create_dataset(name=rand_gen(str)) yield dataset + dataset.delete() + @pytest.fixture def project_with_ontology(project): From 9e41e82169da875ec0d9aee9638fa6fbca864b60 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Fri, 18 Aug 2023 11:31:08 -0700 Subject: [PATCH 14/20] Add config for fixture profiling --- Makefile | 1 + .../integration/annotation_import/conftest.py | 
11 +----- .../annotation_import/test_data_types.py | 13 +++++-- tests/integration/conftest.py | 37 +++++++------------ 4 files changed, 25 insertions(+), 37 deletions(-) diff --git a/Makefile b/Makefile index f9f490554..b7838a7d4 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,7 @@ test-local: build-image -e LABELBOX_TEST_ENVIRON="local" \ -e DA_GCP_LABELBOX_API_KEY=${DA_GCP_LABELBOX_API_KEY} \ -e LABELBOX_TEST_API_KEY_LOCAL=${LABELBOX_TEST_API_KEY_LOCAL} \ + -e FIXTURE_PROFILE=true \ local/labelbox-python:test pytest $(PATH_TO_TEST) test-staging: build-image diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index 3f1cd7de5..988ad7883 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -498,7 +498,6 @@ def get_data_row_id(indx=0): @pytest.fixture def configured_project(client, initial_dataset, ontology, rand_gen, image_url): - start_time = time.time() dataset = initial_dataset project = client.create_project(name=rand_gen(str), queue_mode=QueueMode.Batch) @@ -515,15 +514,12 @@ def configured_project(client, initial_dataset, ontology, rand_gen, image_url): num_rows += 1 project._wait_until_data_rows_are_processed(data_row_ids=data_row_ids, sleep_interval=3) - if pytest.data_row_report: - pytest.data_row_report['times'] += time.time() - start_time - pytest.data_row_report['num_rows'] += num_rows + project.create_batch( rand_gen(str), data_row_ids, # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) ) - print("After creating batch ", time.time() - start_time) project.data_row_ids = data_row_ids yield project @@ -579,8 +575,6 @@ def dataset_conversation_entity(client, rand_gen, conversation_entity_data_row, @pytest.fixture def configured_project_with_one_data_row(client, ontology, rand_gen, initial_dataset, image_url): - start_time = time.time() - project = client.create_project(name=rand_gen(str), description=rand_gen(str), queue_mode=QueueMode.Batch) @@ -594,9 +588,6 @@ def configured_project_with_one_data_row(client, ontology, rand_gen, project._wait_until_data_rows_are_processed(data_row_ids=data_row_ids, sleep_interval=3) - if pytest.data_row_report: - pytest.data_row_report['times'] += time.time() - start_time - pytest.data_row_report['num_rows'] += 1 batch = project.create_batch( rand_gen(str), data_row_ids, # sample of data row objects diff --git a/tests/integration/annotation_import/test_data_types.py b/tests/integration/annotation_import/test_data_types.py index a5c27eb20..79e8b03cb 100644 --- a/tests/integration/annotation_import/test_data_types.py +++ b/tests/integration/annotation_import/test_data_types.py @@ -135,10 +135,15 @@ def create_data_row_for_project(project, dataset, data_row_ndjson, batch_name): AudioData, ConversationData, DicomData, DocumentData, HTMLData, ImageData, TextData ]) -def test_import_data_types(client, configured_project, initial_dataset, - rand_gen, data_row_json_by_data_type, - annotations_by_data_type, data_type_class, - one_datarow): +def test_import_data_types( + client, + configured_project, + initial_dataset, + rand_gen, + data_row_json_by_data_type, + annotations_by_data_type, + data_type_class, +): project = configured_project project_id = project.uid diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index af66a7ed4..a6651b97d 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -27,8 +27,6 @@ IMG_URL = "https://picsum.photos/200/300.jpg" 
SMALL_DATASET_URL = "https://storage.googleapis.com/lb-artifacts-testing-public/sdk_integration_test/potato.jpeg" -DATA_ROW_PROCESSING_WAIT_TIMEOUT_SECONDS = 30 -DATA_ROW_PROCESSING_WAIT_SLEEP_INTERNAL_SECONDS = 5 class Environ(Enum): @@ -458,10 +456,8 @@ def configured_batch_project_with_label(project, dataset, data_row, One label is already created and yielded when using fixture """ data_rows = [dr.uid for dr in list(dataset.data_rows())] - project._wait_until_data_rows_are_processed( - data_row_ids=data_rows, - wait_processing_max_seconds=DATA_ROW_PROCESSING_WAIT_TIMEOUT_SECONDS, - sleep_interval=DATA_ROW_PROCESSING_WAIT_SLEEP_INTERNAL_SECONDS) + project._wait_until_data_rows_are_processed(data_row_ids=data_rows, + sleep_interval=3) project.create_batch("test-batch", data_rows) project.data_row_ids = data_rows @@ -604,7 +600,6 @@ def configured_project_with_complex_ontology(client, initial_dataset, rand_gen, project.setup(editor, ontology.asdict()) yield [project, data_row] - dataset.delete() project.delete() @@ -825,35 +820,31 @@ def upload_invalid_data_rows_for_dataset(dataset: Dataset): task.wait_till_done() -@pytest.mark.skipif("FIXTURE_PROFILE" not in os.environ) def pytest_configure(): pytest.report = defaultdict(int) - pytest.data_row_report = {'times': 0, 'num_rows': 0} -@pytest.mark.skipif("FIXTURE_PROFILE" not in os.environ) @pytest.hookimpl(hookwrapper=True) -def pytest_fixture_setup(fixturedef, request): +def pytest_fixture_setup(fixturedef): start = time.time() yield - end = time.time() exec_time = end - start - pytest.report[fixturedef.argname] += exec_time + if "FIXTURE_PROFILE" in os.environ: + pytest.report[fixturedef.argname] += exec_time -@pytest.mark.skipif("FIXTURE_PROFILE" not in os.environ) @pytest.fixture(scope='session', autouse=True) def print_perf_summary(): yield - sorted_dict = dict( - sorted(pytest.report.items(), key=lambda item: item[1], reverse=True)) - num_of_entries = 10 if len(sorted_dict) >= 10 else len(sorted_dict) - slowest_fixtures = [ - (aaa, sorted_dict[aaa]) for aaa in islice(sorted_dict, num_of_entries) - ] - print("\nTop slowest fixtures:\n", slowest_fixtures, file=sys.stderr) - print("Data row report:\n", pytest.data_row_report, file=sys.stderr) - # assert False + if "FIXTURE_PROFILE" in os.environ: + sorted_dict = dict( + sorted(pytest.report.items(), + key=lambda item: item[1], + reverse=True)) + num_of_entries = 10 if len(sorted_dict) >= 10 else len(sorted_dict) + slowest_fixtures = [(aaa, sorted_dict[aaa]) + for aaa in islice(sorted_dict, num_of_entries)] + print("\nTop slowest fixtures:\n", slowest_fixtures, file=sys.stderr) From ba2990d68087b575a7f27ed6e366fab1bb7caf53 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Fri, 18 Aug 2023 15:02:14 -0700 Subject: [PATCH 15/20] Add explanation on how to supply data row ids to prediction_id_mapping --- .../integration/annotation_import/conftest.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index 988ad7883..1980d6f26 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -505,13 +505,11 @@ def configured_project(client, initial_dataset, ontology, rand_gen, image_url): client.get_labeling_frontends( where=LabelingFrontend.name == "editor"))[0] project.setup(editor, ontology) - num_rows = 0 data_row_ids = [] for _ in range(len(ontology['tools']) + len(ontology['classifications'])): 
data_row_ids.append(dataset.create_data_row(row_data=image_url).uid) - num_rows += 1 project._wait_until_data_rows_are_processed(data_row_ids=data_row_ids, sleep_interval=3) @@ -605,6 +603,22 @@ def configured_project_with_one_data_row(client, ontology, rand_gen, # At the moment it expects only one feature per tool type and this creates unnecessary coupling between differet tests # In an example of a 'rectangle' we have extended to support multiple instances of the same tool type # TODO: we will support this approach in the future for all tools +# +""" +Please note that this fixture now offers the flexibility to configure three different strategies for generating data row ids for predictions: +Default(configured_project fixture): + configured_project that generates a data row for each member of ontology. + This makes sure each prediction has its own data row id. This is applicable to prediction upload cases when last label overwrites existing ones + +Optimized Strategy (configured_project_with_one_data_row fixture): + This fixture has only one data row and all predictions will be mapped to it + +Custom Data Row IDs Strategy: + Individuals can create their own fixture to supply data row ids. + This particular fixture, termed "hardcoded_datarow_id," should be defined locally within a test file. +""" + + @pytest.fixture def prediction_id_mapping(ontology, request): # Maps tool types to feature schema ids From 700fefe87cb2259ea78b8994f986b641ce954cd7 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Fri, 18 Aug 2023 15:50:42 -0700 Subject: [PATCH 16/20] Fix test_user_and_org.py --- tests/integration/test_user_and_org.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_user_and_org.py b/tests/integration/test_user_and_org.py index 9f07666de..ca158527c 100644 --- a/tests/integration/test_user_and_org.py +++ b/tests/integration/test_user_and_org.py @@ -1,3 +1,6 @@ +from labelbox.schema.project import Project + + def test_user(client): user = client.get_user() assert user.uid is not None @@ -10,14 +13,11 @@ def test_organization(client): assert client.get_user() in set(organization.users()) -def test_user_and_org_projects(project): - client = project.client +def test_user_and_org_projects(client, project): user = client.get_user() org = client.get_organization() - user_projects = set(user.projects()) - org_projects = set(org.projects()) + user_project = user.projects(where=Project.uid == project.uid) + org_project = org.projects(where=Project.uid == project.uid) - assert project.created_by() == user - assert project.organization() == org - assert project in user_projects - assert project in org_projects \ No newline at end of file + assert user_project + assert org_project \ No newline at end of file From 5a6e250fba457336a78bd249c1cc22efc1973842 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Tue, 22 Aug 2023 17:18:27 -0700 Subject: [PATCH 17/20] PR updates --- .../integration/annotation_import/conftest.py | 8 +++++-- .../test_bulk_import_request.py | 8 +++---- tests/integration/conftest.py | 22 ++++++++++--------- tests/integration/export_v2/conftest.py | 2 +- .../export_v2/test_export_video.py | 4 ++-- tests/integration/test_filtering.py | 4 ++-- tests/integration/test_project.py | 6 ++--- 7 files changed, 30 insertions(+), 24 deletions(-) diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index 1980d6f26..ebfe74f47 100644 --- a/tests/integration/annotation_import/conftest.py +++ 
b/tests/integration/annotation_import/conftest.py @@ -513,7 +513,7 @@ def configured_project(client, initial_dataset, ontology, rand_gen, image_url): project._wait_until_data_rows_are_processed(data_row_ids=data_row_ids, sleep_interval=3) - project.create_batch( + batch = project.create_batch( rand_gen(str), data_row_ids, # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) @@ -521,6 +521,8 @@ def configured_project(client, initial_dataset, ontology, rand_gen, image_url): project.data_row_ids = data_row_ids yield project + + batch.delete() project.delete() @@ -614,8 +616,10 @@ def configured_project_with_one_data_row(client, ontology, rand_gen, This fixture has only one data row and all predictions will be mapped to it Custom Data Row IDs Strategy: - Individuals can create their own fixture to supply data row ids. + Individuals can supply hard-coded data row ids when a creation of data row is not required. This particular fixture, termed "hardcoded_datarow_id," should be defined locally within a test file. + In the future, we can use this approach to inject correct number of rows instead of using configured_project fixture + that creates a data row for each member of ontology (14 in total) for each run. """ diff --git a/tests/integration/annotation_import/test_bulk_import_request.py b/tests/integration/annotation_import/test_bulk_import_request.py index 6691cc044..52552f53d 100644 --- a/tests/integration/annotation_import/test_bulk_import_request.py +++ b/tests/integration/annotation_import/test_bulk_import_request.py @@ -41,13 +41,13 @@ def test_create_from_url(project): assert bulk_import_request.state == BulkImportRequestState.RUNNING -def test_validate_file(project_with_ontology): +def test_validate_file(project_with_empty_ontology): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" with pytest.raises(MALValidationError): - project_with_ontology.upload_annotations(name=name, - annotations=url, - validate=True) + project_with_empty_ontology.upload_annotations(name=name, + annotations=url, + validate=True) #Schema ids shouldn't match diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index a6651b97d..781fe6edb 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -27,6 +27,8 @@ IMG_URL = "https://picsum.photos/200/300.jpg" SMALL_DATASET_URL = "https://storage.googleapis.com/lb-artifacts-testing-public/sdk_integration_test/potato.jpeg" +DATA_ROW_PROCESSING_WAIT_TIMEOUT_SECONDS = 30 +DATA_ROW_PROCESSING_WAIT_SLEEP_INTERNAL_SECONDS = 3 class Environ(Enum): @@ -392,7 +394,7 @@ def initial_dataset(client, rand_gen): @pytest.fixture -def project_with_ontology(project): +def project_with_empty_ontology(project): editor = list( project.client.get_labeling_frontends( where=LabelingFrontend.name == "editor"))[0] @@ -402,13 +404,13 @@ def project_with_ontology(project): @pytest.fixture -def configured_project(project_with_ontology, initial_dataset, rand_gen, +def configured_project(project_with_empty_ontology, initial_dataset, rand_gen, image_url): dataset = initial_dataset data_row_id = dataset.create_data_row(row_data=image_url).uid - project = project_with_ontology + project = project_with_empty_ontology - project.create_batch( + batch = project.create_batch( rand_gen(str), [data_row_id], # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) @@ -417,6 +419,8 @@ def configured_project(project_with_ontology, initial_dataset, rand_gen, yield 
project + batch.delete() + @pytest.fixture def configured_project_with_label(client, rand_gen, image_url, project, dataset, @@ -426,21 +430,19 @@ def configured_project_with_label(client, rand_gen, image_url, project, dataset, Additionally includes a create_label method for any needed extra labels One label is already created and yielded when using fixture """ - start_time = time.time() - project._wait_until_data_rows_are_processed(data_row_ids=[data_row.uid], - sleep_interval=3) + project._wait_until_data_rows_are_processed( + data_row_ids=[data_row.uid], + wait_processing_max_seconds=DATA_ROW_PROCESSING_WAIT_TIMEOUT_SECONDS, + sleep_interval=DATA_ROW_PROCESSING_WAIT_SLEEP_INTERNAL_SECONDS) project.create_batch( rand_gen(str), [data_row.uid], # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) ) - print("create_batch took: ", time.time() - start_time) ontology = _setup_ontology(project) - print("setup ontology took: ", time.time() - start_time) label = _create_label(project, data_row, ontology, wait_for_label_processing) - print("create_label took: ", time.time() - start_time) yield [project, dataset, data_row, label] for label in project.labels(): diff --git a/tests/integration/export_v2/conftest.py b/tests/integration/export_v2/conftest.py index 757bba44e..af8b4c66f 100644 --- a/tests/integration/export_v2/conftest.py +++ b/tests/integration/export_v2/conftest.py @@ -297,7 +297,7 @@ def configured_project_with_ontology(client, initial_dataset, ontology, @pytest.fixture -def configured_project_with_one_data_row(client, ontology, rand_gen): +def configured_project_without_data_rows(client, ontology, rand_gen): project = client.create_project(name=rand_gen(str), description=rand_gen(str), queue_mode=QueueMode.Batch) diff --git a/tests/integration/export_v2/test_export_video.py b/tests/integration/export_v2/test_export_video.py index 94828e1b6..863f4d31e 100644 --- a/tests/integration/export_v2/test_export_video.py +++ b/tests/integration/export_v2/test_export_video.py @@ -5,11 +5,11 @@ from labelbox.schema.annotation_import import AnnotationImportState -def test_export_v2_video(client, configured_project_with_one_data_row, +def test_export_v2_video(client, configured_project_without_data_rows, video_data, video_data_row, bbox_video_annotation_objects, rand_gen): - project = configured_project_with_one_data_row + project = configured_project_without_data_rows project_id = project.uid labels = [] _, data_row_uids = video_data diff --git a/tests/integration/test_filtering.py b/tests/integration/test_filtering.py index fde7f0638..f44cdcdcb 100644 --- a/tests/integration/test_filtering.py +++ b/tests/integration/test_filtering.py @@ -15,7 +15,7 @@ def project_to_test_where(client, rand_gen): p_b = client.create_project(name=p_b_name, queue_mode=QueueMode.Batch) p_c = client.create_project(name=p_c_name, queue_mode=QueueMode.Batch) - yield p_a, p_b, p_c + yield p_a, p_b p_a.delete() p_b.delete() @@ -26,7 +26,7 @@ def project_to_test_where(client, rand_gen): # other builds simultaneously adding projects to test org def test_where(client, project_to_test_where): p_a, p_b, p_c = project_to_test_where - p_a_name, p_b_name, _ = [p.name for p in [p_a, p_b, p_c]] + p_a_name, p_b_name = [p.name for p in [p_a, p_b]] def get(where=None): date_where = Project.created_at >= p_a.created_at diff --git a/tests/integration/test_project.py b/tests/integration/test_project.py index b3b683a3d..94c98ee50 100644 --- a/tests/integration/test_project.py +++ b/tests/integration/test_project.py @@ 
-171,15 +171,15 @@ def test_attach_instructions(client, project): @pytest.mark.skipif(condition=os.environ['LABELBOX_TEST_ENVIRON'] == "onprem", reason="new mutation does not work for onprem") -def test_html_instructions(project_with_ontology): +def test_html_instructions(project_with_empty_ontology): html_file_path = '/tmp/instructions.html' sample_html_str = "" with open(html_file_path, 'w') as file: file.write(sample_html_str) - project_with_ontology.upsert_instructions(html_file_path) - updated_ontology = project_with_ontology.ontology().normalized + project_with_empty_ontology.upsert_instructions(html_file_path) + updated_ontology = project_with_empty_ontology.ontology().normalized instructions = updated_ontology.pop('projectInstructions') assert requests.get(instructions).text == sample_html_str From e585e8c14c89503a99319c342f80903bc5eeeaec Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Tue, 22 Aug 2023 17:35:52 -0700 Subject: [PATCH 18/20] Turn on fixture profile for staging --- .github/workflows/python-package.yml | 1 + tests/integration/annotation_import/conftest.py | 1 - tests/integration/test_filtering.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 6ed378f09..83c0393af 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -39,6 +39,7 @@ jobs: echo "LABELBOX_TEST_ENVIRON=prod" >> $GITHUB_ENV else echo "LABELBOX_TEST_ENVIRON=staging" >> $GITHUB_ENV + echo "FIXTURE_PROFILE=true" >> $GITHUB_ENV fi - uses: actions/checkout@v2 diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index ebfe74f47..d50c44d0c 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -522,7 +522,6 @@ def configured_project(client, initial_dataset, ontology, rand_gen, image_url): yield project - batch.delete() project.delete() diff --git a/tests/integration/test_filtering.py b/tests/integration/test_filtering.py index f44cdcdcb..7dd687759 100644 --- a/tests/integration/test_filtering.py +++ b/tests/integration/test_filtering.py @@ -15,7 +15,7 @@ def project_to_test_where(client, rand_gen): p_b = client.create_project(name=p_b_name, queue_mode=QueueMode.Batch) p_c = client.create_project(name=p_c_name, queue_mode=QueueMode.Batch) - yield p_a, p_b + yield p_a, p_b, p_c p_a.delete() p_b.delete() From becff6637314e214630facbae15c29c7d89e14e1 Mon Sep 17 00:00:00 2001 From: Richard Sun Date: Wed, 23 Aug 2023 10:46:32 -0700 Subject: [PATCH 19/20] [QQC-2355] Limit number of data rows to check for processing status at once (#1218) --- labelbox/schema/project.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index addb8c10b..bd912aa0b 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -1454,12 +1454,35 @@ def _wait_until_data_rows_are_processed( """ Wait until all the specified data rows are processed""" start_time = datetime.now() + max_data_rows_per_poll = 100_000 + if data_row_ids is not None: + for i in range(0, len(data_row_ids), max_data_rows_per_poll): + chunk = data_row_ids[i:i + max_data_rows_per_poll] + self._poll_data_row_processing_status( + chunk, [], start_time, wait_processing_max_seconds, + sleep_interval) + + if global_keys is not None: + for i in range(0, len(global_keys), max_data_rows_per_poll): + chunk = 
global_keys[i:i + max_data_rows_per_poll] + self._poll_data_row_processing_status( + [], chunk, start_time, wait_processing_max_seconds, + sleep_interval) + + def _poll_data_row_processing_status( + self, + data_row_ids: List[str], + global_keys: List[str], + start_time: datetime, + wait_processing_max_seconds: int = _wait_processing_max_seconds, + sleep_interval=30): + while True: if (datetime.now() - start_time).total_seconds() >= wait_processing_max_seconds: raise ProcessingWaitTimeout( - "Maximum wait time exceeded while waiting for data rows to be processed. Try creating a batch a bit later" - ) + """Maximum wait time exceeded while waiting for data rows to be processed. + Try creating a batch a bit later""") all_good = self.__check_data_rows_have_been_processed( data_row_ids, global_keys) From c4bdab4b005a9f5ecd48c956ff9884e5a8577121 Mon Sep 17 00:00:00 2001 From: Klaus Opreschko Date: Thu, 24 Aug 2023 10:14:34 -0600 Subject: [PATCH 20/20] prep for release --- CHANGELOG.md | 4 ++++ docs/source/conf.py | 2 +- labelbox/__init__.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49969ccc9..c28fb8f1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,8 @@ # Changelog +# Version 3.52.0 (2023-08-24) +## Added +* Added methods to create multiple batches for a project from a list of data rows +* Limit the number of data rows to be checked for processing status # Version 3.51.0 (2023-08-14) ## Added * Added global keys to export v2 filters for project, dataset and DataRow diff --git a/docs/source/conf.py b/docs/source/conf.py index af3413148..dd6ee3a1e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,7 +21,7 @@ copyright = '2021, Labelbox' author = 'Labelbox' -release = '3.51.0' +release = '3.52.0' # -- General configuration --------------------------------------------------- diff --git a/labelbox/__init__.py b/labelbox/__init__.py index 1dba57cf5..7a9efc5e6 100644 --- a/labelbox/__init__.py +++ b/labelbox/__init__.py @@ -1,5 +1,5 @@ name = "labelbox" -__version__ = "3.51.0" +__version__ = "3.52.0" from labelbox.client import Client from labelbox.schema.project import Project
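
For reference, the "Custom Data Row IDs Strategy" described in the prediction_id_mapping docstring added by patch 15 mentions a test-local fixture named "hardcoded_datarow_id" but never shows one. A hypothetical sketch of such a fixture is below; the placeholder id value and the callable shape (mirroring the existing get_data_row_id(indx=0) helper visible in the conftest hunks above) are assumptions for illustration, not code from these patches:

import pytest


@pytest.fixture
def hardcoded_datarow_id():
    # Hypothetical, test-local fixture: yields a callable that returns a known,
    # pre-existing data row id so the test does not create any new data rows.
    data_row_id = "cl_existing_data_row_id"  # placeholder value, not a real id

    def get_data_row_id(indx=0):
        return data_row_id

    yield get_data_row_id

A test opting into this strategy would presumably request hardcoded_datarow_id alongside prediction_id_mapping instead of relying on configured_project to create one data row per ontology member.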
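
The hunk for patch 19 is split mid-statement above (at "chunk = "), so the chunked wait it adds to Project._wait_until_data_rows_are_processed is restated here as a standalone sketch. Only the 100,000-id chunking and the per-chunk call to _poll_data_row_processing_status come from the diff; the free-function signature and the timeout default are assumptions:

from datetime import datetime
from typing import List, Optional


def wait_until_data_rows_are_processed(
        project,
        data_row_ids: Optional[List[str]] = None,
        global_keys: Optional[List[str]] = None,
        wait_processing_max_seconds: int = 3600,  # default is an assumption
        sleep_interval: int = 30) -> None:
    """Sketch of the chunked wait from patch 19, not the SDK's actual method."""
    start_time = datetime.now()
    max_data_rows_per_poll = 100_000  # check at most this many ids per polling pass

    if data_row_ids is not None:
        for i in range(0, len(data_row_ids), max_data_rows_per_poll):
            chunk = data_row_ids[i:i + max_data_rows_per_poll]
            project._poll_data_row_processing_status(
                chunk, [], start_time, wait_processing_max_seconds,
                sleep_interval)

    if global_keys is not None:
        for i in range(0, len(global_keys), max_data_rows_per_poll):
            chunk = global_keys[i:i + max_data_rows_per_poll]
            project._poll_data_row_processing_status(
                [], chunk, start_time, wait_processing_max_seconds,
                sleep_interval)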