From 263174aa07de8441ee4f3390363168984555d11f Mon Sep 17 00:00:00 2001 From: Attila Papai <97034214+attila-papai@users.noreply.github.com> Date: Wed, 16 Aug 2023 21:18:38 +0200 Subject: [PATCH 01/20] [AL-6743] skip invalid media type validation when ADV is enabled (#1210) --- tests/integration/test_data_rows.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_data_rows.py b/tests/integration/test_data_rows.py index 870b0cf41..8248304b3 100644 --- a/tests/integration/test_data_rows.py +++ b/tests/integration/test_data_rows.py @@ -980,7 +980,7 @@ def test_create_conversational_text(dataset, conversational_content): data_row.row_data).json() == conversational_content['row_data'] -def test_invalid_media_type(dataset, conversational_content): +def test_invalid_media_type(dataset, conversational_content, is_adv_enabled): for error_message, invalid_media_type in [[ "Found invalid contents for media type: 'IMAGE'", 'IMAGE' ], ["Found invalid media type: 'totallyinvalid'", 'totallyinvalid']]: @@ -988,9 +988,13 @@ def test_invalid_media_type(dataset, conversational_content): # using malformed query. But for invalid contents in FileUploads we use InvalidQueryError with pytest.raises(labelbox.exceptions.InvalidQueryError): dataset.create_data_rows_sync([{ - **conversational_content, 'media_type': invalid_media_type + **conversational_content, 'media_type': 'IMAGE' }]) + if is_adv_enabled: + # ADV does not take media type hint into account for async import requests + continue + task = dataset.create_data_rows([{ **conversational_content, 'media_type': invalid_media_type }]) From 454950a51ffebd9a2595fe4d5b4992e3cd9a4ce5 Mon Sep 17 00:00:00 2001 From: Richard Sun Date: Thu, 17 Aug 2023 11:18:09 -0700 Subject: [PATCH 02/20] [QQC-2355] Provide methods to create multiple batches (#1197) --- labelbox/client.py | 24 ++++ labelbox/schema/create_batches_task.py | 67 +++++++++ labelbox/schema/project.py | 190 +++++++++++++++++++++---- tests/integration/conftest.py | 45 +++++- tests/integration/test_batch.py | 55 ++----- tests/integration/test_batches.py | 36 +++++ 6 files changed, 344 insertions(+), 73 deletions(-) create mode 100644 labelbox/schema/create_batches_task.py create mode 100644 tests/integration/test_batches.py diff --git a/labelbox/client.py b/labelbox/client.py index ce1ebe33c..c28f3fb78 100644 --- a/labelbox/client.py +++ b/labelbox/client.py @@ -1704,3 +1704,27 @@ def unarchive_feature_schema_node(self, ontology_id: str, raise labelbox.exceptions.LabelboxError( "Failed unarchive the feature schema node, message: ", response.text) + + def get_batch(self, project_id: str, batch_id: str) -> Entity.Batch: + # obtain batch entity to return + get_batch_str = """query %s($projectId: ID!, $batchId: ID!) 
{ + project(where: {id: $projectId}) { + batches(where: {id: $batchId}) { + nodes { + %s + } + } + } + } + """ % ("getProjectBatchPyApi", + query.results_query_part(Entity.Batch)) + + batch = self.execute( + get_batch_str, { + "projectId": project_id, + "batchId": batch_id + }, + timeout=180.0, + experimental=True)["project"]["batches"]["nodes"][0] + + return Entity.Batch(self, project_id, batch) diff --git a/labelbox/schema/create_batches_task.py b/labelbox/schema/create_batches_task.py new file mode 100644 index 000000000..919d30204 --- /dev/null +++ b/labelbox/schema/create_batches_task.py @@ -0,0 +1,67 @@ +import json +from typing import TYPE_CHECKING, Callable, List, Optional, Dict, Any + +from labelbox.orm.model import Entity + +if TYPE_CHECKING: + from labelbox import User + + def lru_cache() -> Callable[..., Callable[..., Dict[str, Any]]]: + pass +else: + from functools import lru_cache + + +class CreateBatchesTask: + + def __init__(self, client, project_id: str, batch_ids: List[str], + task_ids: List[str]): + self.client = client + self.project_id = project_id + self.batches = batch_ids + self.tasks = [ + Entity.Task.get_task(self.client, task_id) for task_id in task_ids + ] + + def wait_till_done(self, timeout_seconds: int = 300) -> None: + """ + Waits for the task to complete. + + Args: + timeout_seconds: the number of seconds to wait before timing out + + Returns: None + """ + + for task in self.tasks: + task.wait_till_done(timeout_seconds) + + def errors(self) -> Optional[Dict[str, Any]]: + """ + Returns the errors from the task, if any. + + Returns: a dictionary of errors, keyed by task id + """ + + errors = {} + for task in self.tasks: + if task.status == "FAILED": + errors[task.uid] = json.loads(task.result_url) + + if len(errors) == 0: + return None + + return errors + + @lru_cache() + def result(self): + """ + Returns the batches created by the task. + + Returns: the list of batches created by the task + """ + + return [ + self.client.get_batch(self.project_id, batch_id) + for batch_id in self.batches + ] diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index b5f4fdb8d..addb8c10b 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -7,9 +7,9 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union from urllib.parse import urlparse -from labelbox import parser import requests +from labelbox import parser from labelbox import utils from labelbox.exceptions import (InvalidQueryError, LabelboxError, ProcessingWaitTimeout, ResourceConflict, @@ -19,6 +19,7 @@ from labelbox.orm.model import Entity, Field, Relationship from labelbox.pagination import PaginatedCollection from labelbox.schema.consensus_settings import ConsensusSettings +from labelbox.schema.create_batches_task import CreateBatchesTask from labelbox.schema.data_row import DataRow from labelbox.schema.export_filters import ProjectExportFilters, validate_datetime, build_filters from labelbox.schema.export_params import ProjectExportParams @@ -26,8 +27,8 @@ from labelbox.schema.queue_mode import QueueMode from labelbox.schema.resource_tag import ResourceTag from labelbox.schema.task import Task -from labelbox.schema.user import User from labelbox.schema.task_queue import TaskQueue +from labelbox.schema.user import User if TYPE_CHECKING: from labelbox import BulkImportRequest @@ -721,14 +722,19 @@ def create_batch( consensus_settings: Optional[Dict[str, float]] = None, global_keys: Optional[List[str]] = None, ): - """Create a new batch for a project. 
One of `global_keys` or `data_rows` must be provided but not both. + """ + Creates a new batch for a project. One of `global_keys` or `data_rows` must be provided, but not both. A + maximum of 100,000 data rows can be added to a batch. Args: name: a name for the batch, must be unique within a project data_rows: Either a list of `DataRows` or Data Row ids. global_keys: global keys for data rows to add to the batch. priority: An optional priority for the Data Rows in the Batch. 1 highest -> 5 lowest - consensus_settings: An optional dictionary with consensus settings: {'number_of_labels': 3, 'coverage_percentage': 0.1} + consensus_settings: An optional dictionary with consensus settings: {'number_of_labels': 3, + 'coverage_percentage': 0.1} + + Returns: the created batch """ # @TODO: make this automatic? @@ -773,6 +779,156 @@ def create_batch( return self._create_batch_sync(name, dr_ids, global_keys, priority, consensus_settings) + def create_batches( + self, + name_prefix: str, + data_rows: Optional[List[Union[str, DataRow]]] = None, + global_keys: Optional[List[str]] = None, + priority: int = 5, + consensus_settings: Optional[Dict[str, float]] = None, + ) -> CreateBatchesTask: + """ + Creates batches for a project from a list of data rows. One of `global_keys` or `data_rows` must be provided, + but not both. When more than 100k data rows are specified and thus multiple batches are needed, the specific + batch that each data row will be placed in is undefined. + + Batches will be created with the specified name prefix and a unique suffix. The suffix will be a 4-digit + number starting at 0000. For example, if the name prefix is "batch" and 3 batches are created, the names + will be "batch0000", "batch0001", and "batch0002". This method will throw an error if a batch with the same + name already exists. + + Args: + name_prefix: a prefix for the batch names, must be unique within a project + data_rows: Either a list of `DataRows` or Data Row ids. + global_keys: global keys for data rows to add to the batch. + priority: An optional priority for the Data Rows in the Batch. 1 highest -> 5 lowest + consensus_settings: An optional dictionary with consensus settings: {'number_of_labels': 3, + 'coverage_percentage': 0.1} + + Returns: a task for the created batches + """ + + if self.queue_mode != QueueMode.Batch: + raise ValueError("Project must be in batch mode") + + dr_ids = [] + if data_rows is not None: + for dr in data_rows: + if isinstance(dr, Entity.DataRow): + dr_ids.append(dr.uid) + elif isinstance(dr, str): + dr_ids.append(dr) + else: + raise ValueError( + "`data_rows` must be DataRow ids or DataRow objects") + + self._wait_until_data_rows_are_processed( + dr_ids, global_keys, self._wait_processing_max_seconds) + + if consensus_settings: + consensus_settings = ConsensusSettings(**consensus_settings).dict( + by_alias=True) + + method = 'createBatches' + mutation_str = """mutation %sPyApi($projectId: ID!, $input: CreateBatchesInput!) 
{ + project(where: {id: $projectId}) { + %s(input: $input) { + tasks { + batchUuid + taskId + } + } + } + } + """ % (method, method) + + params = { + "projectId": self.uid, + "input": { + "batchNamePrefix": name_prefix, + "dataRowIds": dr_ids, + "globalKeys": global_keys, + "priority": priority, + "consensusSettings": consensus_settings + } + } + + tasks = self.client.execute( + mutation_str, params, experimental=True)["project"][method]["tasks"] + batch_ids = [task["batchUuid"] for task in tasks] + task_ids = [task["taskId"] for task in tasks] + + return CreateBatchesTask(self.client, self.uid, batch_ids, task_ids) + + def create_batches_from_dataset( + self, + name_prefix: str, + dataset_id: str, + priority: int = 5, + consensus_settings: Optional[Dict[str, + float]] = None) -> CreateBatchesTask: + """ + Creates batches for a project from a dataset, selecting only the data rows that are not already added to the + project. When the dataset contains more than 100k data rows and multiple batches are needed, the specific batch + that each data row will be placed in is undefined. Note that data rows may not be immediately available for a + project after being added to a dataset; use the `_wait_until_data_rows_are_processed` method to ensure that + data rows are available before creating batches. + + Batches will be created with the specified name prefix and a unique suffix. The suffix will be a 4-digit + number starting at 0000. For example, if the name prefix is "batch" and 3 batches are created, the names + will be "batch0000", "batch0001", and "batch0002". This method will throw an error if a batch with the same + name already exists. + + Args: + name_prefix: a prefix for the batch names, must be unique within a project + dataset_id: the id of the dataset to create batches from + priority: An optional priority for the Data Rows in the Batch. 1 highest -> 5 lowest + consensus_settings: An optional dictionary with consensus settings: {'number_of_labels': 3, + 'coverage_percentage': 0.1} + + Returns: a task for the created batches + """ + + if self.queue_mode != QueueMode.Batch: + raise ValueError("Project must be in batch mode") + + if consensus_settings: + consensus_settings = ConsensusSettings(**consensus_settings).dict( + by_alias=True) + + print("Creating batches from dataset %s", dataset_id) + + method = 'createBatchesFromDataset' + mutation_str = """mutation %sPyApi($projectId: ID!, $input: CreateBatchesFromDatasetInput!) { + project(where: {id: $projectId}) { + %s(input: $input) { + tasks { + batchUuid + taskId + } + } + } + } + """ % (method, method) + + params = { + "projectId": self.uid, + "input": { + "batchNamePrefix": name_prefix, + "datasetId": dataset_id, + "priority": priority, + "consensusSettings": consensus_settings + } + } + + tasks = self.client.execute( + mutation_str, params, experimental=True)["project"][method]["tasks"] + + batch_ids = [task["batchUuid"] for task in tasks] + task_ids = [task["taskId"] for task in tasks] + + return CreateBatchesTask(self.client, self.uid, batch_ids, task_ids) + def _create_batch_sync(self, name, dr_ids, global_keys, priority, consensus_settings): method = 'createBatchV2' @@ -843,7 +999,7 @@ def _create_batch_async(self, add_data_rows_mutation_str = """mutation %sPyApi($projectId: ID!, $input: AddDataRowsToBatchInput!) 
{ project(where: {id: $projectId}) { %s(input: $input) { - taskId + taskId } } } @@ -871,29 +1027,7 @@ def _create_batch_async(self, raise LabelboxError(f"Batch was not created successfully: " + json.dumps(task.errors)) - # obtain batch entity to return - get_batch_str = """query %s($projectId: ID!, $batchId: ID!) { - project(where: {id: $projectId}) { - batches(where: {id: $batchId}) { - nodes { - %s - } - } - } - } - """ % ("getProjectBatchPyApi", - query.results_query_part(Entity.Batch)) - - batch = self.client.execute( - get_batch_str, { - "projectId": self.uid, - "batchId": batch_id - }, - timeout=180.0, - experimental=True)["project"]["batches"]["nodes"][0] - - # TODO async endpoints currently do not provide failed_data_row_ids in response - return Entity.Batch(self.client, self.uid, batch) + return self.client.get_batch(self.uid, batch_id) def _update_queue_mode(self, mode: "QueueMode") -> "QueueMode": """ diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 75474ab4d..ed4229b4d 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -5,13 +5,13 @@ import uuid from enum import Enum from types import SimpleNamespace -from typing import Type +from typing import Type, List import pytest import requests -from labelbox import Client, MediaType -from labelbox import LabelingFrontend, Dataset +from labelbox import Client, Dataset +from labelbox import LabelingFrontend from labelbox import OntologyBuilder, Tool, Option, Classification, MediaType from labelbox.orm import query from labelbox.pagination import PaginatedCollection @@ -768,3 +768,42 @@ def is_adv_enabled(client) -> bool: query_str = "query IsAdvEnabledPyApi { user { isAdvEnabled } }" response = client.execute(query_str) return bool(response['user']['isAdvEnabled']) + + +IMAGE_URL = "https://storage.googleapis.com/diagnostics-demo-data/coco/COCO_train2014_000000000034.jpg" +EXTERNAL_ID = "my-image" + + +@pytest.fixture +def big_dataset(dataset: Dataset): + task = dataset.create_data_rows([ + { + "row_data": IMAGE_URL, + "external_id": EXTERNAL_ID + }, + ] * 3) + task.wait_till_done() + + yield dataset + + +@pytest.fixture +def big_dataset_data_row_ids(big_dataset: Dataset) -> List[str]: + yield [dr.uid for dr in list(big_dataset.export_data_rows())] + + +@pytest.fixture(scope='function') +def dataset_with_invalid_data_rows(unique_dataset: Dataset): + upload_invalid_data_rows_for_dataset(unique_dataset) + + yield unique_dataset + + +def upload_invalid_data_rows_for_dataset(dataset: Dataset): + task = dataset.create_data_rows([ + { + "row_data": 'gs://invalid-bucket/example.png', # forbidden + "external_id": "image-without-access.jpg" + }, + ] * 2) + task.wait_till_done() diff --git a/tests/integration/test_batch.py b/tests/integration/test_batch.py index 485bf308c..40eb632ef 100644 --- a/tests/integration/test_batch.py +++ b/tests/integration/test_batch.py @@ -1,53 +1,22 @@ import time +from typing import List +from uuid import uuid4 + import pytest -from uuid import uuid4 from labelbox import Dataset, Project from labelbox.exceptions import ProcessingWaitTimeout, MalformedQueryException, ResourceConflict, LabelboxError - -IMAGE_URL = "https://storage.googleapis.com/diagnostics-demo-data/coco/COCO_train2014_000000000034.jpg" -EXTERNAL_ID = "my-image" +from integration.conftest import upload_invalid_data_rows_for_dataset, IMAGE_URL, EXTERNAL_ID def get_data_row_ids(ds: Dataset): return [dr.uid for dr in list(ds.export_data_rows())] -@pytest.fixture -def big_dataset(dataset: 
Dataset): - task = dataset.create_data_rows([ - { - "row_data": IMAGE_URL, - "external_id": EXTERNAL_ID - }, - ] * 3) - task.wait_till_done() - - yield dataset - - -@pytest.fixture(scope='function') -def dataset_with_invalid_data_rows(unique_dataset: Dataset): - upload_invalid_data_rows_for_dataset(unique_dataset) - - yield unique_dataset - - -def upload_invalid_data_rows_for_dataset(dataset: Dataset): - task = dataset.create_data_rows([ - { - "row_data": 'gs://invalid-bucket/example.png', # forbidden - "external_id": "image-without-access.jpg" - }, - ] * 2) - task.wait_till_done() - - -def test_create_batch(project: Project, big_dataset: Dataset): - data_rows = [dr.uid for dr in list(big_dataset.export_data_rows())] - batch = project.create_batch("test-batch", data_rows, 3) +def test_create_batch(project: Project, big_dataset_data_row_ids: List[str]): + batch = project.create_batch("test-batch", big_dataset_data_row_ids, 3) assert batch.name == "test-batch" - assert batch.size == len(data_rows) + assert batch.size == len(big_dataset_data_row_ids) def test_create_batch_with_invalid_data_rows_ids(project: Project): @@ -125,11 +94,13 @@ def test_create_batch_with_float_number_priority(project: Project, priority=4.9) -def test_create_batch_async(project: Project, big_dataset: Dataset): - data_rows = [dr.uid for dr in list(big_dataset.export_data_rows())] - batch = project._create_batch_async("big-batch", data_rows, priority=3) +def test_create_batch_async(project: Project, + big_dataset_data_row_ids: List[str]): + batch = project._create_batch_async("big-batch", + big_dataset_data_row_ids, + priority=3) assert batch.name == "big-batch" - assert batch.size == len(data_rows) + assert batch.size == len(big_dataset_data_row_ids) def test_create_batch_with_consensus_settings(project: Project, diff --git a/tests/integration/test_batches.py b/tests/integration/test_batches.py new file mode 100644 index 000000000..12a4a4355 --- /dev/null +++ b/tests/integration/test_batches.py @@ -0,0 +1,36 @@ +from typing import List + +import pytest + +from labelbox import Project, Dataset + + +def test_create_batches(project: Project, big_dataset_data_row_ids: List[str]): + task = project.create_batches("test-batch", + big_dataset_data_row_ids, + priority=3) + + task.wait_till_done() + assert task.errors() is None + batches = task.result() + + assert len(batches) == 1 + assert batches[0].name == "test-batch0000" + assert batches[0].size == len(big_dataset_data_row_ids) + + +def test_create_batches_from_dataset(project: Project, big_dataset: Dataset): + data_rows = [dr.uid for dr in list(big_dataset.export_data_rows())] + project._wait_until_data_rows_are_processed(data_rows, [], 300) + + task = project.create_batches_from_dataset("test-batch", + big_dataset.uid, + priority=3) + + task.wait_till_done() + assert task.errors() is None + batches = task.result() + + assert len(batches) == 1 + assert batches[0].name == "test-batch0000" + assert batches[0].size == len(data_rows) From 20ebe5f4165aacb4166febd73e82c3837ef33d12 Mon Sep 17 00:00:00 2001 From: Attila Papai <97034214+attila-papai@users.noreply.github.com> Date: Thu, 17 Aug 2023 21:21:48 +0200 Subject: [PATCH 03/20] [AL-6729] assert adv task errors separately (#1214) --- tests/integration/test_task.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/tests/integration/test_task.py b/tests/integration/test_task.py index d591b16a9..b035b09ed 100644 --- a/tests/integration/test_task.py +++ 
b/tests/integration/test_task.py @@ -8,7 +8,7 @@ TEXT_SCHEMA_ID = "cko8s9r5v0001h2dk9elqdidh" -def test_task_errors(dataset, image_url, snapshot): +def test_task_errors(dataset, image_url, snapshot, is_adv_enabled): client = dataset.client task = dataset.create_data_rows([ { @@ -25,16 +25,22 @@ def test_task_errors(dataset, image_url, snapshot): assert task in client.get_user().created_tasks() task.wait_till_done() - # assert task.status == "FAILED" - # assert len(task.failed_data_rows) > 0 - snapshot.snapshot_dir = INTEGRATION_SNAPSHOT_DIRECTORY - # RowData is dynamic, so we need to remove it from the snapshot - task.failed_data_rows[0]['failedDataRows'][0]['rowData'] = '' - snapshot.assert_match(json.dumps(task.failed_data_rows), - 'test_task.test_task_errors.failed_data_rows.json') - assert task.errors is not None - snapshot.assert_match(json.dumps(task.errors), - 'test_task.test_task_errors.errors.json') + if is_adv_enabled: + assert len(task.failed_data_rows) == 1 + assert "A schemaId can only be specified once per DataRow : [cko8s9r5v0001h2dk9elqdidh]" in task.failed_data_rows[ + 0]['message'] + assert len( + task.failed_data_rows[0]['failedDataRows'][0]['metadata']) == 2 + else: + snapshot.snapshot_dir = INTEGRATION_SNAPSHOT_DIRECTORY + # RowData is dynamic, so we need to remove it from the snapshot + task.failed_data_rows[0]['failedDataRows'][0]['rowData'] = '' + snapshot.assert_match( + json.dumps(task.failed_data_rows), + 'test_task.test_task_errors.failed_data_rows.json') + assert task.errors is not None + snapshot.assert_match(json.dumps(task.errors), + 'test_task.test_task_errors.errors.json') def test_task_success_json(dataset, image_url, snapshot): From 20710d6fb4ce2b2f1d1fc4e55af3879ab6316e5c Mon Sep 17 00:00:00 2001 From: Attila Papai <97034214+attila-papai@users.noreply.github.com> Date: Fri, 18 Aug 2023 09:41:25 +0200 Subject: [PATCH 04/20] [AL-6740] assert adv global key errors separately (#1215) --- tests/integration/test_global_keys.py | 37 +++++++++++++++++---------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/tests/integration/test_global_keys.py b/tests/integration/test_global_keys.py index 25ca7ba33..6b9588c18 100644 --- a/tests/integration/test_global_keys.py +++ b/tests/integration/test_global_keys.py @@ -116,7 +116,8 @@ def test_long_global_key_validation(client, dataset, image_url): 'error'] == 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid' -def test_global_key_with_whitespaces_validation(client, dataset, image_url): +def test_global_key_with_whitespaces_validation(client, dataset, image_url, + is_adv_enabled): dr_1 = dataset.create_data_row(row_data=image_url) dr_2 = dataset.create_data_row(row_data=image_url) dr_3 = dataset.create_data_row(row_data=image_url) @@ -137,19 +138,27 @@ def test_global_key_with_whitespaces_validation(client, dataset, image_url): }] res = client.assign_global_keys_to_data_rows(assignment_inputs) - assert len(res['results']) == 0 - assert len(res['errors']) == 3 - assert res['status'] == 'FAILURE' - assign_errors_ids = set([e['data_row_id'] for e in res['errors']]) - assign_errors_gks = set([e['global_key'] for e in res['errors']]) - assign_errors_msgs = set([e['error'] for e in res['errors']]) - assert assign_errors_ids == set([dr_1.uid, dr_2.uid, dr_3.uid]) - assert assign_errors_gks == set([gk_1, gk_2, gk_3]) - assert assign_errors_msgs == set([ - 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', - 'Invalid assignment. 
Either DataRow does not exist, or globalKey is invalid', - 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid' - ]) + if is_adv_enabled: + assert res['status'] == 'PARTIAL SUCCESS' + assert len(res['results']) == 2 + assert len(res['errors']) == 1 + assert res['errors'][0]['global_key'] == gk_3 + assert res['errors'][0][ + 'error'] == "Invalid assignment. Either DataRow does not exist, or globalKey is invalid" + else: + assert len(res['results']) == 0 + assert len(res['errors']) == 3 + assert res['status'] == 'FAILURE' + assign_errors_ids = set([e['data_row_id'] for e in res['errors']]) + assign_errors_gks = set([e['global_key'] for e in res['errors']]) + assign_errors_msgs = set([e['error'] for e in res['errors']]) + assert assign_errors_ids == set([dr_1.uid, dr_2.uid, dr_3.uid]) + assert assign_errors_gks == set([gk_1, gk_2, gk_3]) + assert assign_errors_msgs == set([ + 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', + 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', + 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid' + ]) def test_get_data_row_ids_for_global_keys(client, dataset, image_url): From b439780d1a8a164e3b1c143c3cd86e910bfd95dd Mon Sep 17 00:00:00 2001 From: Attila Papai <97034214+attila-papai@users.noreply.github.com> Date: Fri, 18 Aug 2023 17:37:57 +0200 Subject: [PATCH 05/20] Remove sleep between creating and exporting a batch (#1213) --- tests/integration/export_v2/test_legacy_export.py | 1 - tests/integration/test_batch.py | 2 -- tests/integration/test_project.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/tests/integration/export_v2/test_legacy_export.py b/tests/integration/export_v2/test_legacy_export.py index 071e8254d..e4e1e595c 100644 --- a/tests/integration/export_v2/test_legacy_export.py +++ b/tests/integration/export_v2/test_legacy_export.py @@ -171,7 +171,6 @@ def test_export_data_rows(project: Project, dataset: Dataset): data_rows = [dr.uid for dr in list(dataset.export_data_rows())] batch = project.create_batch("batch test", data_rows) - result = list(batch.export_data_rows()) exported_data_rows = [dr.uid for dr in result] diff --git a/tests/integration/test_batch.py b/tests/integration/test_batch.py index 40eb632ef..600762817 100644 --- a/tests/integration/test_batch.py +++ b/tests/integration/test_batch.py @@ -211,8 +211,6 @@ def test_export_data_rows(project: Project, dataset: Dataset): data_rows = [dr.uid for dr in list(dataset.export_data_rows())] batch = project.create_batch("batch test", data_rows) - # allow time for catapult to sync changes to ES - time.sleep(5) result = list(batch.export_data_rows()) exported_data_rows = [dr.uid for dr in result] diff --git a/tests/integration/test_project.py b/tests/integration/test_project.py index 20b36533a..b9467e0dd 100644 --- a/tests/integration/test_project.py +++ b/tests/integration/test_project.py @@ -228,8 +228,6 @@ def test_create_batch_with_global_keys_sync(project: Project, data_rows): global_keys = [dr.global_key for dr in data_rows] batch_name = f'batch {uuid.uuid4()}' batch = project.create_batch(batch_name, global_keys=global_keys) - # allow time for catapult to sync changes to ES - time.sleep(5) # TODO: Move to export_v2 batch_data_rows = set(batch.export_data_rows()) assert batch_data_rows == set(data_rows) From be939096cad7ed1e45ce1c0cb54823245bfdb28b Mon Sep 17 00:00:00 2001 From: Attila Papai <97034214+attila-papai@users.noreply.github.com> Date: Fri, 18 Aug 2023 17:38:57 +0200 
Subject: [PATCH 06/20] Improve ADV assertion for bulk sync import (#1212) --- tests/integration/test_data_rows.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/tests/integration/test_data_rows.py b/tests/integration/test_data_rows.py index 8248304b3..6a89bf9d4 100644 --- a/tests/integration/test_data_rows.py +++ b/tests/integration/test_data_rows.py @@ -929,9 +929,7 @@ def test_data_row_bulk_creation_sync_with_same_global_keys( dataset, sample_image, is_adv_enabled): global_key_1 = str(uuid.uuid4()) - if is_adv_enabled: - # ADV does not throw an error for duplicate global keys - # but rather create the first one and reject the second + with pytest.raises(labelbox.exceptions.MalformedQueryException) as exc_info: dataset.create_data_rows_sync([{ DataRow.row_data: sample_image, DataRow.global_key: global_key_1 @@ -939,18 +937,13 @@ def test_data_row_bulk_creation_sync_with_same_global_keys( DataRow.row_data: sample_image, DataRow.global_key: global_key_1 }]) + + if is_adv_enabled: + # ADV will import the first data row but not the second (duplicate global key) assert len(list(dataset.data_rows())) == 1 - assert list(dataset.data_rows())[0].global_key == global_key_1 + assert "Some data rows were not imported. Check error output here" in str( + exc_info.value) else: - with pytest.raises(labelbox.exceptions.MalformedQueryException): - dataset.create_data_rows_sync([{ - DataRow.row_data: sample_image, - DataRow.global_key: global_key_1 - }, { - DataRow.row_data: sample_image, - DataRow.global_key: global_key_1 - }]) - assert len(list(dataset.data_rows())) == 0 dataset.create_data_rows_sync([{ From 38c5776f65670a0cfac9a11667a87ab6b2ae1039 Mon Sep 17 00:00:00 2001 From: Attila Papai <97034214+attila-papai@users.noreply.github.com> Date: Mon, 21 Aug 2023 15:33:13 +0200 Subject: [PATCH 07/20] [AL-6929] adjust ADV specific asserts (#1216) --- tests/integration/test_data_rows.py | 1 + tests/integration/test_global_keys.py | 37 ++++++++++----------------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/tests/integration/test_data_rows.py b/tests/integration/test_data_rows.py index 6a89bf9d4..fdc4e7eb4 100644 --- a/tests/integration/test_data_rows.py +++ b/tests/integration/test_data_rows.py @@ -941,6 +941,7 @@ def test_data_row_bulk_creation_sync_with_same_global_keys( if is_adv_enabled: # ADV will import the first data row but not the second (duplicate global key) assert len(list(dataset.data_rows())) == 1 + assert list(dataset.data_rows())[0].global_key == global_key_1 assert "Some data rows were not imported. Check error output here" in str( exc_info.value) else: diff --git a/tests/integration/test_global_keys.py b/tests/integration/test_global_keys.py index 6b9588c18..25ca7ba33 100644 --- a/tests/integration/test_global_keys.py +++ b/tests/integration/test_global_keys.py @@ -116,8 +116,7 @@ def test_long_global_key_validation(client, dataset, image_url): 'error'] == 'Invalid assignment. 
Either DataRow does not exist, or globalKey is invalid' -def test_global_key_with_whitespaces_validation(client, dataset, image_url, - is_adv_enabled): +def test_global_key_with_whitespaces_validation(client, dataset, image_url): dr_1 = dataset.create_data_row(row_data=image_url) dr_2 = dataset.create_data_row(row_data=image_url) dr_3 = dataset.create_data_row(row_data=image_url) @@ -138,27 +137,19 @@ def test_global_key_with_whitespaces_validation(client, dataset, image_url, }] res = client.assign_global_keys_to_data_rows(assignment_inputs) - if is_adv_enabled: - assert res['status'] == 'PARTIAL SUCCESS' - assert len(res['results']) == 2 - assert len(res['errors']) == 1 - assert res['errors'][0]['global_key'] == gk_3 - assert res['errors'][0][ - 'error'] == "Invalid assignment. Either DataRow does not exist, or globalKey is invalid" - else: - assert len(res['results']) == 0 - assert len(res['errors']) == 3 - assert res['status'] == 'FAILURE' - assign_errors_ids = set([e['data_row_id'] for e in res['errors']]) - assign_errors_gks = set([e['global_key'] for e in res['errors']]) - assign_errors_msgs = set([e['error'] for e in res['errors']]) - assert assign_errors_ids == set([dr_1.uid, dr_2.uid, dr_3.uid]) - assert assign_errors_gks == set([gk_1, gk_2, gk_3]) - assert assign_errors_msgs == set([ - 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', - 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', - 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid' - ]) + assert len(res['results']) == 0 + assert len(res['errors']) == 3 + assert res['status'] == 'FAILURE' + assign_errors_ids = set([e['data_row_id'] for e in res['errors']]) + assign_errors_gks = set([e['global_key'] for e in res['errors']]) + assign_errors_msgs = set([e['error'] for e in res['errors']]) + assert assign_errors_ids == set([dr_1.uid, dr_2.uid, dr_3.uid]) + assert assign_errors_gks == set([gk_1, gk_2, gk_3]) + assert assign_errors_msgs == set([ + 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', + 'Invalid assignment. Either DataRow does not exist, or globalKey is invalid', + 'Invalid assignment. 
Either DataRow does not exist, or globalKey is invalid' + ]) def test_get_data_row_ids_for_global_keys(client, dataset, image_url): From 91e1127f42362077d9df8fe15b73f3a43d0be181 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Mon, 21 Aug 2023 14:26:48 -0700 Subject: [PATCH 08/20] Try and fix test_filtering flaky test by removing dataset query testing part, since it is not applicable to BATCH projects any more --- tests/integration/test_filtering.py | 39 +++-------------------------- 1 file changed, 4 insertions(+), 35 deletions(-) diff --git a/tests/integration/test_filtering.py b/tests/integration/test_filtering.py index 5cd185258..fde7f0638 100644 --- a/tests/integration/test_filtering.py +++ b/tests/integration/test_filtering.py @@ -24,18 +24,15 @@ def project_to_test_where(client, rand_gen): # Avoid assertions using equality to prevent intermittent failures due to # other builds simultaneously adding projects to test org -def test_where(client, image_url, project_to_test_where, rand_gen): +def test_where(client, project_to_test_where): p_a, p_b, p_c = project_to_test_where - p_a_name, p_b_name, p_c_name = [p.name for p in [p_a, p_b, p_c]] + p_a_name, p_b_name, _ = [p.name for p in [p_a, p_b, p_c]] - def _get(f, where=None): + def get(where=None): date_where = Project.created_at >= p_a.created_at where = date_where if where is None else where & date_where return {p.uid for p in client.get_projects(where)} - def get(where=None): - return _get(client.get_projects, where) - assert {p_a.uid, p_b.uid, p_c.uid}.issubset(get()) e_a = get(Project.name == p_a_name) assert p_a.uid in e_a and p_b.uid not in e_a and p_c.uid not in e_a @@ -50,34 +47,6 @@ def get(where=None): le_b = get(Project.name <= p_b_name) assert {p_a.uid, p_b.uid}.issubset(le_b) and p_c.uid not in le_b - dataset = client.create_dataset(name="Dataset") - data_row = dataset.create_data_row(row_data=image_url) - data_row_ids = [data_row.uid] - batch = p_a.create_batch( - rand_gen(str), - data_row_ids, # sample of data row objects - 5 # priority between 1(Highest) - 5(lowest) - ) - - def get(where=None): - return _get(batch.project, where) - - assert {p_a.uid, p_b.uid, p_c.uid}.issubset(get()) - e_a = get(Project.name == p_a_name) - assert p_a.uid in e_a and p_b.uid not in e_a and p_c.uid not in e_a - not_b = get(Project.name != p_b_name) - assert {p_a.uid, p_c.uid}.issubset(not_b) and p_b.uid not in not_b - gt_b = get(Project.name > p_b_name) - assert p_c.uid in gt_b and p_a.uid not in gt_b and p_b.uid not in gt_b - lt_b = get(Project.name < p_b_name) - assert p_a.uid in lt_b and p_b.uid not in lt_b and p_c.uid not in lt_b - ge_b = get(Project.name >= p_b_name) - assert {p_b.uid, p_c.uid}.issubset(ge_b) and p_a.uid not in ge_b - le_b = get(Project.name <= p_b_name) - assert {p_a.uid, p_b.uid}.issubset(le_b) and p_c.uid not in le_b - - batch.delete() - def test_unsupported_where(client): with pytest.raises(InvalidQueryError): @@ -89,4 +58,4 @@ def test_unsupported_where(client): (Project.description == "b")) with pytest.raises(InvalidQueryError): - client.get_projects(where=~(Project.name == "a")) + client.get_projects(where=~(Project.name == "a")) \ No newline at end of file From 48285e46b2ae509a8c1ccdc4141459ddd5ad006a Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Wed, 2 Aug 2023 16:20:23 -0700 Subject: [PATCH 09/20] Add instrumentation for fixtures(temp) --- .../integration/annotation_import/conftest.py | 13 ++++++- tests/integration/conftest.py | 36 +++++++++++++++++++ tests/integration/test_dataset.py | 8 +++-- 3 files 
changed, 54 insertions(+), 3 deletions(-) diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index 6db398fe5..6e35d4d0a 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -9,6 +9,7 @@ from typing import Type from labelbox.schema.labeling_frontend import LabelingFrontend from labelbox.schema.annotation_import import LabelImport, AnnotationImportState +from labelbox.schema.project import Project from labelbox.schema.queue_mode import QueueMode DATA_ROW_PROCESSING_WAIT_TIMEOUT_SECONDS = 40 @@ -486,6 +487,7 @@ def initial_dataset(client, rand_gen): @pytest.fixture def configured_project(client, initial_dataset, ontology, rand_gen, image_url): + start_time = time.time() dataset = initial_dataset project = client.create_project( name=rand_gen(str), @@ -496,14 +498,21 @@ def configured_project(client, initial_dataset, ontology, rand_gen, image_url): where=LabelingFrontend.name == "editor"))[0] project.setup(editor, ontology) data_row_ids = [] - + # print("Before creating data rows ", time.time() - start_time) + num_rows = 0 for _ in range(len(ontology['tools']) + len(ontology['classifications'])): data_row_ids.append(dataset.create_data_row(row_data=image_url).uid) + num_rows += 1 + # print("After creating data rows ", time.time() - start_time) + + pytest.data_row_report['times'] += time.time() - start_time + pytest.data_row_report['num_rows'] += num_rows project.create_batch( rand_gen(str), data_row_ids, # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) ) + print("After creating batch ", time.time() - start_time) project.data_row_ids = data_row_ids yield project project.delete() @@ -1006,6 +1015,7 @@ def model_run_with_training_metadata(rand_gen, model): @pytest.fixture def model_run_with_data_rows(client, configured_project, model_run_predictions, model_run, wait_for_label_processing): + start_time = time.time() configured_project.enable_model_assisted_labeling() upload_task = LabelImport.create_from_objects( @@ -1019,6 +1029,7 @@ def model_run_with_data_rows(client, configured_project, model_run_predictions, labels = wait_for_label_processing(configured_project) label_ids = [label.uid for label in labels] model_run.upsert_labels(label_ids) + print(f"model_run_with_data_rows: {time.time() - start_time}") yield model_run model_run.delete() # TODO: Delete resources when that is possible .. 
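Editorial note (not part of the patch): the tests/integration/conftest.py hunk below adds per-fixture timing via a pytest hookwrapper. A minimal, self-contained sketch of that pattern is shown here for reference; the name fixture_times is illustrative only and does not appear in the patch.

import time
from collections import defaultdict

import pytest

fixture_times = defaultdict(float)

@pytest.hookimpl(hookwrapper=True)
def pytest_fixture_setup(fixturedef, request):
    # Wrap every fixture setup and accumulate its wall-clock duration.
    start = time.time()
    yield
    fixture_times[fixturedef.argname] += time.time() - start

@pytest.fixture(scope="session", autouse=True)
def print_fixture_times():
    # Report the slowest fixtures once the test session finishes.
    yield
    slowest = sorted(fixture_times.items(), key=lambda kv: kv[1], reverse=True)[:10]
    for name, total in slowest:
        print(f"{name}: {total:.2f}s")

Placed in a conftest.py, the hook applies to every fixture in the suite without modifying the fixtures themselves, which is the approach the following diff takes.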
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index ed4229b4d..92e23a375 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,3 +1,5 @@ +from collections import defaultdict +from itertools import islice import json import os import re @@ -807,3 +809,37 @@ def upload_invalid_data_rows_for_dataset(dataset: Dataset): }, ] * 2) task.wait_till_done() + + +def pytest_configure(): + pytest.report = defaultdict(int) + pytest.data_row_report = {'times': 0, 'num_rows': 0} + + +@pytest.hookimpl(hookwrapper=True) +def pytest_fixture_setup(fixturedef, request): + start = time.time() + yield + + end = time.time() + + exec_time = end - start + pytest.report[fixturedef.argname] += exec_time + + # print('pytest_fixture_setup' + # f', request={request}' + # f', create_data_row_time={end - start}') + + +@pytest.fixture(scope='session', autouse=True) +def print_perf_summary(): + yield + + sorted_dict = dict( + sorted(pytest.report.items(), key=lambda item: item[1], reverse=True)) + num_of_entries = 10 if len(sorted_dict) >= 10 else len(sorted_dict) + slowest_fixtures = [ + (aaa, sorted_dict[aaa]) for aaa in islice(sorted_dict, num_of_entries) + ] + print("\nTop slowest fixtures:\n", slowest_fixtures) + print("Data row report:\n", pytest.data_row_report) diff --git a/tests/integration/test_dataset.py b/tests/integration/test_dataset.py index d1a31e532..de2f15820 100644 --- a/tests/integration/test_dataset.py +++ b/tests/integration/test_dataset.py @@ -53,8 +53,12 @@ def dataset_for_filtering(client, rand_gen): yield name_1, d1, name_2, d2 - d1.delete() - d2.delete() + +def test_dataset_filtering(client, dataset_for_filtering): + name_1, d1, name_2, d2 = dataset_for_filtering + + assert list(client.get_datasets(where=Dataset.name == name_1)) == [d1] + assert list(client.get_datasets(where=Dataset.name == name_2)) == [d2] def test_dataset_filtering(client, dataset_for_filtering): From b95d1b89767c8ea6f085596315d056c8982b9f99 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Mon, 14 Aug 2023 09:16:26 -0700 Subject: [PATCH 10/20] Convert tests that do now require many data rows prebuilt to a simpler project fixture --- pytest.ini | 2 +- .../integration/annotation_import/conftest.py | 13 ++--- .../test_bulk_import_request.py | 49 +++++++++---------- .../annotation_import/test_data_types.py | 22 ++++----- tests/integration/conftest.py | 29 +++++++---- tests/integration/test_project.py | 14 ++---- 6 files changed, 64 insertions(+), 65 deletions(-) diff --git a/pytest.ini b/pytest.ini index b56afefdd..fbf64a864 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] -addopts = -s -vv --reruns 5 --reruns-delay 10 --durations=20 +addopts = -s -vv markers = slow: marks tests as slow (deselect with '-m "not slow"') diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index 6e35d4d0a..1f88de47a 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -486,17 +486,12 @@ def initial_dataset(client, rand_gen): @pytest.fixture -def configured_project(client, initial_dataset, ontology, rand_gen, image_url): +def configured_project(client, configured_project_without_data_rows, + initial_dataset, ontology, rand_gen, image_url): start_time = time.time() dataset = initial_dataset - project = client.create_project( - name=rand_gen(str), - queue_mode=QueueMode.Batch, - ) - editor = list( - client.get_labeling_frontends( - 
where=LabelingFrontend.name == "editor"))[0] - project.setup(editor, ontology) + project = configured_project_without_data_rows + data_row_ids = [] # print("Before creating data rows ", time.time() - start_time) num_rows = 0 diff --git a/tests/integration/annotation_import/test_bulk_import_request.py b/tests/integration/annotation_import/test_bulk_import_request.py index 4f001af8d..7a66dd667 100644 --- a/tests/integration/annotation_import/test_bulk_import_request.py +++ b/tests/integration/annotation_import/test_bulk_import_request.py @@ -25,15 +25,15 @@ """ -def test_create_from_url(configured_project): +def test_create_from_url(project): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" - bulk_import_request = configured_project.upload_annotations(name=name, - annotations=url, - validate=False) + bulk_import_request = project.upload_annotations(name=name, + annotations=url, + validate=False) - assert bulk_import_request.project() == configured_project + assert bulk_import_request.project() == project assert bulk_import_request.name == name assert bulk_import_request.input_file_url == url assert bulk_import_request.error_file_url is None @@ -41,24 +41,24 @@ def test_create_from_url(configured_project): assert bulk_import_request.state == BulkImportRequestState.RUNNING -def test_validate_file(configured_project): +def test_validate_file(project_with_ontology): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" with pytest.raises(MALValidationError): - configured_project.upload_annotations(name=name, - annotations=url, - validate=True) + project_with_ontology.upload_annotations(name=name, + annotations=url, + validate=True) #Schema ids shouldn't match -def test_create_from_objects(configured_project, predictions, +def test_create_from_objects(configured_project_without_data_rows, predictions, annotation_import_test_helpers): name = str(uuid.uuid4()) - bulk_import_request = configured_project.upload_annotations( + bulk_import_request = configured_project_without_data_rows.upload_annotations( name=name, annotations=predictions) - assert bulk_import_request.project() == configured_project + assert bulk_import_request.project() == configured_project_without_data_rows assert bulk_import_request.name == name assert bulk_import_request.error_file_url is None assert bulk_import_request.status_file_url is None @@ -105,17 +105,17 @@ def test_create_from_local_file(tmp_path, predictions, configured_project, bulk_import_request.input_file_url, predictions) -def test_get(client, configured_project): +def test_get(client, configured_project_without_data_rows): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" - configured_project.upload_annotations(name=name, - annotations=url, - validate=False) + configured_project_without_data_rows.upload_annotations(name=name, + annotations=url, + validate=False) bulk_import_request = BulkImportRequest.from_name( - client, project_id=configured_project.uid, name=name) + client, project_id=configured_project_without_data_rows.uid, name=name) - assert bulk_import_request.project() == configured_project + assert bulk_import_request.project() == configured_project_without_data_rows assert bulk_import_request.name == name assert bulk_import_request.input_file_url == url assert bulk_import_request.error_file_url is None @@ -158,14 +158,13 @@ def test_validate_ndjson_uuid(tmp_path, 
configured_project, predictions): @pytest.mark.slow -def test_wait_till_done(rectangle_inference, configured_project): +def test_wait_till_done(rectangle_inference, + configured_project_without_data_rows): name = str(uuid.uuid4()) - url = configured_project.client.upload_data(content=parser.dumps( - [rectangle_inference]), - sign=True) - bulk_import_request = configured_project.upload_annotations(name=name, - annotations=url, - validate=False) + url = configured_project_without_data_rows.client.upload_data( + content=parser.dumps([rectangle_inference]), sign=True) + bulk_import_request = configured_project_without_data_rows.upload_annotations( + name=name, annotations=url, validate=False) assert len(bulk_import_request.inputs) == 1 bulk_import_request.wait_until_done() diff --git a/tests/integration/annotation_import/test_data_types.py b/tests/integration/annotation_import/test_data_types.py index f8e392cf5..30559198b 100644 --- a/tests/integration/annotation_import/test_data_types.py +++ b/tests/integration/annotation_import/test_data_types.py @@ -125,7 +125,6 @@ def create_data_row_for_project(project, dataset, data_row_ndjson, batch_name): [data_row.uid], # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) ) - project.data_row_ids.append(data_row.uid) return data_row @@ -135,12 +134,12 @@ def create_data_row_for_project(project, dataset, data_row_ndjson, batch_name): AudioData, ConversationData, DicomData, DocumentData, HTMLData, ImageData, TextData ]) -def test_import_data_types(client, configured_project, initial_dataset, - rand_gen, data_row_json_by_data_type, - annotations_by_data_type, data_type_class): +def test_import_data_types(client, project, initial_dataset, rand_gen, + data_row_json_by_data_type, annotations_by_data_type, + data_type_class): - project = configured_project - project_id = configured_project.uid + project = project + project_id = project.uid dataset = initial_dataset set_project_media_type_from_data_type(project, data_type_class) @@ -261,11 +260,11 @@ def test_import_data_types_v2(client, configured_project, initial_dataset, @pytest.mark.parametrize('data_type, data_class, annotations', test_params) -def test_import_label_annotations(client, configured_project, initial_dataset, - data_row_json_by_data_type, data_type, - data_class, annotations, rand_gen): +def test_import_label_annotations(client, configured_project_without_data_rows, + initial_dataset, data_row_json_by_data_type, + data_type, data_class, annotations, rand_gen): - project = configured_project + project = configured_project_without_data_rows dataset = initial_dataset set_project_media_type_from_data_type(project, data_class) @@ -297,7 +296,8 @@ def test_import_label_annotations(client, configured_project, initial_dataset, assert export_task.errors is None expected_annotations = get_annotation_comparison_dicts_from_labels(labels) actual_annotations = get_annotation_comparison_dicts_from_export( - export_task.result, data_row.uid, configured_project.uid) + export_task.result, data_row.uid, + configured_project_without_data_rows.uid) assert actual_annotations == expected_annotations data_row.delete() diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 92e23a375..c47524ed6 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -3,6 +3,7 @@ import json import os import re +import sys import time import uuid from enum import Enum @@ -390,9 +391,21 @@ def initial_dataset(client, rand_gen): @pytest.fixture -def 
configured_project(project, initial_dataset, client, rand_gen, image_url): +def project_with_ontology(project): + editor = list( + project.client.get_labeling_frontends( + where=LabelingFrontend.name == "editor"))[0] + empty_ontology = {"tools": [], "classifications": []} + project.setup(editor, empty_ontology) + yield project + + +@pytest.fixture +def configured_project(project_with_ontology, initial_dataset, rand_gen, + image_url): dataset = initial_dataset data_row_id = dataset.create_data_row(row_data=image_url).uid + project = project_with_ontology project.create_batch( rand_gen(str), @@ -401,14 +414,7 @@ def configured_project(project, initial_dataset, client, rand_gen, image_url): ) project.data_row_ids = [data_row_id] - editor = list( - project.client.get_labeling_frontends( - where=LabelingFrontend.name == "editor"))[0] - empty_ontology = {"tools": [], "classifications": []} - project.setup(editor, empty_ontology) yield project - dataset.delete() - project.delete() @pytest.fixture @@ -833,6 +839,8 @@ def pytest_fixture_setup(fixturedef, request): @pytest.fixture(scope='session', autouse=True) def print_perf_summary(): + print("Starting measurements\n", file=sys.stderr) + yield sorted_dict = dict( @@ -841,5 +849,6 @@ def print_perf_summary(): slowest_fixtures = [ (aaa, sorted_dict[aaa]) for aaa in islice(sorted_dict, num_of_entries) ] - print("\nTop slowest fixtures:\n", slowest_fixtures) - print("Data row report:\n", pytest.data_row_report) + print("\nTop slowest fixtures:\n", slowest_fixtures, file=sys.stderr) + print("Data row report:\n", pytest.data_row_report, file=sys.stderr) + # assert False diff --git a/tests/integration/test_project.py b/tests/integration/test_project.py index b9467e0dd..b3b683a3d 100644 --- a/tests/integration/test_project.py +++ b/tests/integration/test_project.py @@ -171,15 +171,15 @@ def test_attach_instructions(client, project): @pytest.mark.skipif(condition=os.environ['LABELBOX_TEST_ENVIRON'] == "onprem", reason="new mutation does not work for onprem") -def test_html_instructions(configured_project): +def test_html_instructions(project_with_ontology): html_file_path = '/tmp/instructions.html' sample_html_str = "" with open(html_file_path, 'w') as file: file.write(sample_html_str) - configured_project.upsert_instructions(html_file_path) - updated_ontology = configured_project.ontology().normalized + project_with_ontology.upsert_instructions(html_file_path) + updated_ontology = project_with_ontology.ontology().normalized instructions = updated_ontology.pop('projectInstructions') assert requests.get(instructions).text == sample_html_str @@ -200,10 +200,6 @@ def test_same_ontology_after_instructions( assert instructions is not None -def test_queue_mode(configured_project: Project): - assert configured_project.queue_mode == QueueMode.Batch - - def test_batches(project: Project, dataset: Dataset, image_url): task = dataset.create_data_rows([ { @@ -243,9 +239,9 @@ def test_create_batch_with_global_keys_async(project: Project, data_rows): assert batch_data_rows == set(data_rows) -def test_media_type(client, configured_project: Project, rand_gen): +def test_media_type(client, project: Project, rand_gen): # Existing project with no media_type - assert isinstance(configured_project.media_type, MediaType) + assert isinstance(project.media_type, MediaType) # Update test project = client.create_project(name=rand_gen(str)) From 4976908f25ece76697cda14efe952d061f69a92c Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Tue, 15 Aug 2023 15:05:36 -0700 Subject: [PATCH 
11/20] Adding an option to configure source of data rows for predictions, also making ndjson test use project without datatows --- .../integration/annotation_import/conftest.py | 81 +++++++++- .../test_ndjson_validation.py | 138 +++++++++++------- tests/integration/conftest.py | 2 - 3 files changed, 158 insertions(+), 63 deletions(-) diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index 1f88de47a..ca34d2dfb 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -486,8 +486,27 @@ def initial_dataset(client, rand_gen): @pytest.fixture -def configured_project(client, configured_project_without_data_rows, - initial_dataset, ontology, rand_gen, image_url): +def hardcoded_datarow_id(): + data_row_id = 'ck8q9q9qj00003g5z3q1q9q9q' + + def get_data_row_id(indx=0): + return data_row_id + + yield get_data_row_id + + +@pytest.fixture +def configured_project_datarow_id(configured_project): + + def get_data_row_id(indx=0): + return configured_project.data_row_ids[indx] + + yield get_data_row_id + + +@pytest.fixture +def configured_project(configured_project_without_data_rows, initial_dataset, + ontology, rand_gen, image_url): start_time = time.time() dataset = initial_dataset project = configured_project_without_data_rows @@ -509,6 +528,7 @@ def configured_project(client, configured_project_without_data_rows, ) print("After creating batch ", time.time() - start_time) project.data_row_ids = data_row_ids + yield project project.delete() @@ -577,10 +597,19 @@ def configured_project_without_data_rows(client, ontology, rand_gen): # In an example of a 'rectangle' we have extended to support multiple instances of the same tool type # TODO: we will support this approach in the future for all tools @pytest.fixture -def prediction_id_mapping(configured_project): +def prediction_id_mapping(configured_project_without_data_rows, ontology, + request): # Maps tool types to feature schema ids - project = configured_project + if 'configured_project' in request.fixturenames: + data_row_id_factory = request.getfixturevalue( + 'configured_project_datarow_id') + project = configured_project + else: + data_row_id_factory = request.getfixturevalue('hardcoded_datarow_id') + project = configured_project_without_data_rows + ontology = project.ontology().normalized + result = {} for idx, tool in enumerate(ontology['tools'] + ontology['classifications']): @@ -597,7 +626,47 @@ def prediction_id_mapping(configured_project): "schemaId": tool['featureSchemaId'], "name": tool['name'], "dataRow": { - "id": project.data_row_ids[idx], + "id": data_row_id_factory(idx), + }, + 'tool': tool + } + if tool_type not in result: + result[tool_type] = [] + result[tool_type].append(value) + else: + result[tool_type] = { + "uuid": str(uuid.uuid4()), + "schemaId": tool['featureSchemaId'], + "name": tool['name'], + "dataRow": { + "id": data_row_id_factory(idx), + }, + 'tool': tool + } + return result + + +@pytest.fixture +def prediction_id_mapping_datarow_id(): + # Maps tool types to feature schema ids + data_row_id = 'ck8q9q9qj00003g5z3q1q9q9q' + result = {} + + for _, tool in enumerate(ontology['tools'] + ontology['classifications']): + if 'tool' in tool: + tool_type = tool['tool'] + else: + tool_type = tool[ + 'type'] if 'scope' not in tool else f"{tool['type']}_{tool['scope']}" # so 'checklist' of 'checklist_index' + + # TODO: remove this once we have a better way to associate multiple tools instances with a single tool 
type + if tool_type == 'rectangle': + value = { + "uuid": str(uuid.uuid4()), + "schemaId": tool['featureSchemaId'], + "name": tool['name'], + "dataRow": { + "id": data_row_id, }, 'tool': tool } @@ -610,7 +679,7 @@ def prediction_id_mapping(configured_project): "schemaId": tool['featureSchemaId'], "name": tool['name'], "dataRow": { - "id": project.data_row_ids[idx], + "id": data_row_id, }, 'tool': tool } diff --git a/tests/integration/annotation_import/test_ndjson_validation.py b/tests/integration/annotation_import/test_ndjson_validation.py index 53bb85eed..466968e85 100644 --- a/tests/integration/annotation_import/test_ndjson_validation.py +++ b/tests/integration/annotation_import/test_ndjson_validation.py @@ -9,6 +9,24 @@ NDRadio, NDRectangle, NDText, NDTextEntity, NDTool, _validate_ndjson) +from labelbox.schema.labeling_frontend import LabelingFrontend +from labelbox.schema.queue_mode import QueueMode + + +@pytest.fixture +def configured_project_without_data_rows(client, + ontology, + rand_gen, + scope="module"): + project = client.create_project(name=rand_gen(str), + description=rand_gen(str), + queue_mode=QueueMode.Batch) + editor = list( + client.get_labeling_frontends( + where=LabelingFrontend.name == "editor"))[0] + project.setup(editor, ontology) + yield project + project.delete() def test_classification_construction(checklist_inference, text_inference): @@ -37,187 +55,198 @@ def test_tool_construction(inference, expected_type): def test_incorrect_feature_schema(rectangle_inference, polygon_inference, - configured_project): + configured_project_without_data_rows): #Valid but incorrect feature schema #Prob the error message says something about the config not anything useful. We might want to fix this. pred = rectangle_inference.copy() pred['schemaId'] = polygon_inference['schemaId'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def no_tool(text_inference, configured_project): +def no_tool(text_inference, configured_project_without_data_rows): pred = text_inference.copy() #Missing key del pred['answer'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_invalid_text(text_inference, configured_project): +def test_invalid_text(text_inference, configured_project_without_data_rows): #and if it is not a string pred = text_inference.copy() #Extra and wrong key del pred['answer'] pred['answers'] = [] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) del pred['answers'] #Invalid type pred['answer'] = [] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) #Invalid type pred['answer'] = None with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_invalid_checklist_item(checklist_inference, configured_project): +def test_invalid_checklist_item(checklist_inference, + configured_project_without_data_rows): #Only two points pred = checklist_inference.copy() pred['answers'] = [pred['answers'][0], pred['answers'][0]] #Duplicate schema ids with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], 
configured_project_without_data_rows) pred['answers'] = [{"name": "asdfg"}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) pred['answers'] = [{"schemaId": "1232132132"}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) pred['answers'] = [{}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) pred['answers'] = [] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) del pred['answers'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_invalid_polygon(polygon_inference, configured_project): +def test_invalid_polygon(polygon_inference, + configured_project_without_data_rows): #Only two points pred = polygon_inference.copy() pred['polygon'] = [{"x": 100, "y": 100}, {"x": 200, "y": 200}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_incorrect_entity(entity_inference, configured_project): +def test_incorrect_entity(entity_inference, + configured_project_without_data_rows): entity = entity_inference.copy() #Location cannot be a list entity["location"] = [0, 10] with pytest.raises(MALValidationError): - _validate_ndjson([entity], configured_project) + _validate_ndjson([entity], configured_project_without_data_rows) entity["location"] = {"start": -1, "end": 5} with pytest.raises(MALValidationError): - _validate_ndjson([entity], configured_project) + _validate_ndjson([entity], configured_project_without_data_rows) entity["location"] = {"start": 15, "end": 5} with pytest.raises(MALValidationError): - _validate_ndjson([entity], configured_project) + _validate_ndjson([entity], configured_project_without_data_rows) -def test_incorrect_mask(segmentation_inference, configured_project): +def test_incorrect_mask(segmentation_inference, + configured_project_without_data_rows): seg = segmentation_inference.copy() seg['mask']['colorRGB'] = [-1, 0, 10] with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project) + _validate_ndjson([seg], configured_project_without_data_rows) seg['mask']['colorRGB'] = [0, 0] with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project) + _validate_ndjson([seg], configured_project_without_data_rows) seg['mask'] = {'counts': [0], 'size': [0, 1]} with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project) + _validate_ndjson([seg], configured_project_without_data_rows) seg['mask'] = {'counts': [-1], 'size': [1, 1]} with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project) + _validate_ndjson([seg], configured_project_without_data_rows) -def test_all_validate_json(configured_project, predictions): +def test_all_validate_json(configured_project_without_data_rows, predictions): #Predictions contains one of each type of prediction. #These should be properly formatted and pass. 
- _validate_ndjson(predictions, configured_project) + _validate_ndjson(predictions, configured_project_without_data_rows) -def test_incorrect_line(line_inference, configured_project): +def test_incorrect_line(line_inference, configured_project_without_data_rows): line = line_inference.copy() line["line"] = [line["line"][0]] #Just one point with pytest.raises(MALValidationError): - _validate_ndjson([line], configured_project) + _validate_ndjson([line], configured_project_without_data_rows) -def test_incorrect_rectangle(rectangle_inference, configured_project): +def test_incorrect_rectangle(rectangle_inference, + configured_project_without_data_rows): del rectangle_inference['bbox']['top'] with pytest.raises(MALValidationError): - _validate_ndjson([rectangle_inference], configured_project) + _validate_ndjson([rectangle_inference], + configured_project_without_data_rows) -def test_duplicate_tools(rectangle_inference, configured_project): +def test_duplicate_tools(rectangle_inference, + configured_project_without_data_rows): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() pred['polygon'] = [{"x": 100, "y": 100}, {"x": 200, "y": 200}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_invalid_feature_schema(configured_project, rectangle_inference): +def test_invalid_feature_schema(configured_project_without_data_rows, + rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() pred['schemaId'] = "blahblah" with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_name_only_feature_schema(configured_project, rectangle_inference): +def test_name_only_feature_schema(configured_project_without_data_rows, + rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() del pred['schemaId'] - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_schema_id_only_feature_schema(configured_project, rectangle_inference): +def test_schema_id_only_feature_schema(configured_project_without_data_rows, + rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() del pred['name'] - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_missing_feature_schema(configured_project, rectangle_inference): +def test_missing_feature_schema(configured_project_without_data_rows, + rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() del pred['schemaId'] del pred['name'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) -def test_validate_ndjson(tmp_path, configured_project): +def test_validate_ndjson(tmp_path, configured_project_without_data_rows): file_name = f"broken.ndjson" file_path = tmp_path / file_name with file_path.open("w") as f: f.write("test") with pytest.raises(ValueError): - configured_project.upload_annotations(name="name", - annotations=str(file_path), - validate=True) + configured_project_without_data_rows.upload_annotations( + name="name", annotations=str(file_path), validate=True) -def 
test_validate_ndjson_uuid(tmp_path, configured_project, predictions): +def test_validate_ndjson_uuid(tmp_path, configured_project_without_data_rows, + predictions): file_name = f"repeat_uuid.ndjson" file_path = tmp_path / file_name repeat_uuid = predictions.copy() @@ -228,16 +257,15 @@ def test_validate_ndjson_uuid(tmp_path, configured_project, predictions): parser.dump(repeat_uuid, f) with pytest.raises(MALValidationError): - configured_project.upload_annotations(name="name", - validate=True, - annotations=str(file_path)) + configured_project_without_data_rows.upload_annotations( + name="name", validate=True, annotations=str(file_path)) with pytest.raises(MALValidationError): - configured_project.upload_annotations(name="name", - validate=True, - annotations=repeat_uuid) + configured_project_without_data_rows.upload_annotations( + name="name", validate=True, annotations=repeat_uuid) -def test_video_upload(video_checklist_inference, configured_project): +def test_video_upload(video_checklist_inference, + configured_project_without_data_rows): pred = video_checklist_inference.copy() - _validate_ndjson([pred], configured_project) + _validate_ndjson([pred], configured_project_without_data_rows) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index c47524ed6..82c739ddb 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -839,8 +839,6 @@ def pytest_fixture_setup(fixturedef, request): @pytest.fixture(scope='session', autouse=True) def print_perf_summary(): - print("Starting measurements\n", file=sys.stderr) - yield sorted_dict = dict( From 551c1efe0cc87b60c1b2f6509f613b75cb432d4b Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Tue, 15 Aug 2023 15:24:56 -0700 Subject: [PATCH 12/20] Replacing configured_project --- pytest.ini | 2 +- tests/conftest.py | 2 +- .../integration/annotation_import/conftest.py | 120 ++++++++--------- .../test_bulk_import_request.py | 41 +++--- .../test_conversation_import.py | 6 +- .../annotation_import/test_data_types.py | 23 ++-- .../annotation_import/test_label_import.py | 19 ++- .../annotation_import/test_model.py | 4 +- .../annotation_import/test_model_run.py | 14 +- .../test_ndjson_validation.py | 121 +++++++++--------- .../test_upsert_prediction_import.py | 18 +-- tests/integration/conftest.py | 15 ++- tests/integration/export_v2/conftest.py | 2 +- .../export_v2/test_export_video.py | 4 +- 14 files changed, 194 insertions(+), 197 deletions(-) diff --git a/pytest.ini b/pytest.ini index fbf64a864..b56afefdd 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] -addopts = -s -vv +addopts = -s -vv --reruns 5 --reruns-delay 10 --durations=20 markers = slow: marks tests as slow (deselect with '-m "not slow"') diff --git a/tests/conftest.py b/tests/conftest.py index b4dd6dce0..b724426d8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,7 +12,7 @@ ] -@pytest.fixture +@pytest.fixture(scope="session") def rand_gen(): def gen(field_type): diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index ca34d2dfb..3f1cd7de5 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -211,7 +211,7 @@ def annotations_by_data_type_v2( } -@pytest.fixture +@pytest.fixture(scope='session') def ontology(): bbox_tool_with_nested_text = { 'required': @@ -479,48 +479,45 @@ def func(project): @pytest.fixture -def initial_dataset(client, rand_gen): - dataset = 
client.create_dataset(name=rand_gen(str)) - yield dataset - dataset.delete() - - -@pytest.fixture -def hardcoded_datarow_id(): - data_row_id = 'ck8q9q9qj00003g5z3q1q9q9q' +def configured_project_datarow_id(configured_project): def get_data_row_id(indx=0): - return data_row_id + return configured_project.data_row_ids[indx] yield get_data_row_id @pytest.fixture -def configured_project_datarow_id(configured_project): +def configured_project_one_datarow_id(configured_project_with_one_data_row): def get_data_row_id(indx=0): - return configured_project.data_row_ids[indx] + return configured_project_with_one_data_row.data_row_ids[0] yield get_data_row_id @pytest.fixture -def configured_project(configured_project_without_data_rows, initial_dataset, - ontology, rand_gen, image_url): +def configured_project(client, initial_dataset, ontology, rand_gen, image_url): start_time = time.time() dataset = initial_dataset - project = configured_project_without_data_rows + project = client.create_project(name=rand_gen(str), + queue_mode=QueueMode.Batch) + editor = list( + client.get_labeling_frontends( + where=LabelingFrontend.name == "editor"))[0] + project.setup(editor, ontology) + num_rows = 0 data_row_ids = [] - # print("Before creating data rows ", time.time() - start_time) - num_rows = 0 + for _ in range(len(ontology['tools']) + len(ontology['classifications'])): data_row_ids.append(dataset.create_data_row(row_data=image_url).uid) num_rows += 1 - # print("After creating data rows ", time.time() - start_time) - - pytest.data_row_report['times'] += time.time() - start_time - pytest.data_row_report['num_rows'] += num_rows + project._wait_until_data_rows_are_processed(data_row_ids=data_row_ids, + sleep_interval=3) + if pytest.data_row_report: + pytest.data_row_report['times'] += time.time() - start_time + pytest.data_row_report['num_rows'] += num_rows project.create_batch( rand_gen(str), data_row_ids, # sample of data row objects @@ -580,7 +577,10 @@ def dataset_conversation_entity(client, rand_gen, conversation_entity_data_row, @pytest.fixture -def configured_project_without_data_rows(client, ontology, rand_gen): +def configured_project_with_one_data_row(client, ontology, rand_gen, + initial_dataset, image_url): + start_time = time.time() + project = client.create_project(name=rand_gen(str), description=rand_gen(str), queue_mode=QueueMode.Batch) @@ -588,7 +588,25 @@ def configured_project_without_data_rows(client, ontology, rand_gen): client.get_labeling_frontends( where=LabelingFrontend.name == "editor"))[0] project.setup(editor, ontology) + + data_row = initial_dataset.create_data_row(row_data=image_url) + data_row_ids = [data_row.uid] + project._wait_until_data_rows_are_processed(data_row_ids=data_row_ids, + sleep_interval=3) + + if pytest.data_row_report: + pytest.data_row_report['times'] += time.time() - start_time + pytest.data_row_report['num_rows'] += 1 + batch = project.create_batch( + rand_gen(str), + data_row_ids, # sample of data row objects + 5 # priority between 1(Highest) - 5(lowest) + ) + project.data_row_ids = data_row_ids + yield project + + batch.delete() project.delete() @@ -597,16 +615,20 @@ def configured_project_without_data_rows(client, ontology, rand_gen): # In an example of a 'rectangle' we have extended to support multiple instances of the same tool type # TODO: we will support this approach in the future for all tools @pytest.fixture -def prediction_id_mapping(configured_project_without_data_rows, ontology, - request): +def prediction_id_mapping(ontology, request): # Maps tool 
types to feature schema ids if 'configured_project' in request.fixturenames: data_row_id_factory = request.getfixturevalue( 'configured_project_datarow_id') - project = configured_project - else: + project = request.getfixturevalue('configured_project') + elif 'hardcoded_datarow_id' in request.fixturenames: data_row_id_factory = request.getfixturevalue('hardcoded_datarow_id') - project = configured_project_without_data_rows + project = request.getfixturevalue('configured_project_with_ontology') + else: + data_row_id_factory = request.getfixturevalue( + 'configured_project_one_datarow_id') + project = request.getfixturevalue( + 'configured_project_with_one_data_row') ontology = project.ontology().normalized @@ -646,46 +668,6 @@ def prediction_id_mapping(configured_project_without_data_rows, ontology, return result -@pytest.fixture -def prediction_id_mapping_datarow_id(): - # Maps tool types to feature schema ids - data_row_id = 'ck8q9q9qj00003g5z3q1q9q9q' - result = {} - - for _, tool in enumerate(ontology['tools'] + ontology['classifications']): - if 'tool' in tool: - tool_type = tool['tool'] - else: - tool_type = tool[ - 'type'] if 'scope' not in tool else f"{tool['type']}_{tool['scope']}" # so 'checklist' of 'checklist_index' - - # TODO: remove this once we have a better way to associate multiple tools instances with a single tool type - if tool_type == 'rectangle': - value = { - "uuid": str(uuid.uuid4()), - "schemaId": tool['featureSchemaId'], - "name": tool['name'], - "dataRow": { - "id": data_row_id, - }, - 'tool': tool - } - if tool_type not in result: - result[tool_type] = [] - result[tool_type].append(value) - else: - result[tool_type] = { - "uuid": str(uuid.uuid4()), - "schemaId": tool['featureSchemaId'], - "name": tool['name'], - "dataRow": { - "id": data_row_id, - }, - 'tool': tool - } - return result - - @pytest.fixture def polygon_inference(prediction_id_mapping): polygon = prediction_id_mapping['polygon'].copy() @@ -1079,7 +1061,6 @@ def model_run_with_training_metadata(rand_gen, model): @pytest.fixture def model_run_with_data_rows(client, configured_project, model_run_predictions, model_run, wait_for_label_processing): - start_time = time.time() configured_project.enable_model_assisted_labeling() upload_task = LabelImport.create_from_objects( @@ -1093,7 +1074,6 @@ def model_run_with_data_rows(client, configured_project, model_run_predictions, labels = wait_for_label_processing(configured_project) label_ids = [label.uid for label in labels] model_run.upsert_labels(label_ids) - print(f"model_run_with_data_rows: {time.time() - start_time}") yield model_run model_run.delete() # TODO: Delete resources when that is possible .. 
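The conftest change above routes prediction_id_mapping through request.fixturenames and request.getfixturevalue so each test decides which data-row source it gets. A minimal, self-contained sketch of that dispatch pattern follows; the fixture and test names here are hypothetical illustrations, not part of the patch:

import pytest


@pytest.fixture
def per_tool_data_row_ids():
    # Stands in for configured_project_datarow_id: one data row id per ontology tool.
    def factory(indx=0):
        return f"datarow-{indx}"

    yield factory


@pytest.fixture
def single_data_row_id():
    # Stands in for configured_project_one_datarow_id: every prediction shares one row.
    def factory(indx=0):
        return "datarow-0"

    yield factory


@pytest.fixture
def data_row_id_factory(request):
    # Same dispatch idea as prediction_id_mapping: pick whichever source the
    # requesting test opted into, falling back to the single-row default.
    if "per_tool_data_row_ids" in request.fixturenames:
        return request.getfixturevalue("per_tool_data_row_ids")
    return request.getfixturevalue("single_data_row_id")


def test_uses_per_tool_rows(data_row_id_factory, per_tool_data_row_ids):
    assert data_row_id_factory(3) == "datarow-3"


def test_defaults_to_single_row(data_row_id_factory):
    assert data_row_id_factory(3) == "datarow-0"

The design intent, as the surrounding commits describe it, is that validation-only tests can point predictions at a hardcoded or single data row id instead of creating one data row per ontology tool, which keeps those tests fast and avoids sharing project state between tests.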
diff --git a/tests/integration/annotation_import/test_bulk_import_request.py b/tests/integration/annotation_import/test_bulk_import_request.py index 7a66dd667..6691cc044 100644 --- a/tests/integration/annotation_import/test_bulk_import_request.py +++ b/tests/integration/annotation_import/test_bulk_import_request.py @@ -51,14 +51,14 @@ def test_validate_file(project_with_ontology): #Schema ids shouldn't match -def test_create_from_objects(configured_project_without_data_rows, predictions, +def test_create_from_objects(configured_project_with_one_data_row, predictions, annotation_import_test_helpers): name = str(uuid.uuid4()) - bulk_import_request = configured_project_without_data_rows.upload_annotations( + bulk_import_request = configured_project_with_one_data_row.upload_annotations( name=name, annotations=predictions) - assert bulk_import_request.project() == configured_project_without_data_rows + assert bulk_import_request.project() == configured_project_with_one_data_row assert bulk_import_request.name == name assert bulk_import_request.error_file_url is None assert bulk_import_request.status_file_url is None @@ -105,17 +105,17 @@ def test_create_from_local_file(tmp_path, predictions, configured_project, bulk_import_request.input_file_url, predictions) -def test_get(client, configured_project_without_data_rows): +def test_get(client, configured_project_with_one_data_row): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" - configured_project_without_data_rows.upload_annotations(name=name, + configured_project_with_one_data_row.upload_annotations(name=name, annotations=url, validate=False) bulk_import_request = BulkImportRequest.from_name( - client, project_id=configured_project_without_data_rows.uid, name=name) + client, project_id=configured_project_with_one_data_row.uid, name=name) - assert bulk_import_request.project() == configured_project_without_data_rows + assert bulk_import_request.project() == configured_project_with_one_data_row assert bulk_import_request.name == name assert bulk_import_request.input_file_url == url assert bulk_import_request.error_file_url is None @@ -123,16 +123,15 @@ def test_get(client, configured_project_without_data_rows): assert bulk_import_request.state == BulkImportRequestState.RUNNING -def test_validate_ndjson(tmp_path, configured_project): +def test_validate_ndjson(tmp_path, configured_project_with_one_data_row): file_name = f"broken.ndjson" file_path = tmp_path / file_name with file_path.open("w") as f: f.write("test") with pytest.raises(ValueError): - configured_project.upload_annotations(name="name", - validate=True, - annotations=str(file_path)) + configured_project_with_one_data_row.upload_annotations( + name="name", validate=True, annotations=str(file_path)) def test_validate_ndjson_uuid(tmp_path, configured_project, predictions): @@ -159,11 +158,11 @@ def test_validate_ndjson_uuid(tmp_path, configured_project, predictions): @pytest.mark.slow def test_wait_till_done(rectangle_inference, - configured_project_without_data_rows): + configured_project_with_one_data_row): name = str(uuid.uuid4()) - url = configured_project_without_data_rows.client.upload_data( + url = configured_project_with_one_data_row.client.upload_data( content=parser.dumps([rectangle_inference]), sign=True) - bulk_import_request = configured_project_without_data_rows.upload_annotations( + bulk_import_request = configured_project_with_one_data_row.upload_annotations( name=name, annotations=url, validate=False) 
assert len(bulk_import_request.inputs) == 1 @@ -298,7 +297,7 @@ def test_pdf_mal_bbox(client, configured_project_pdf): assert import_annotations.errors == [] -def test_pdf_document_entity(client, configured_project_without_data_rows, +def test_pdf_document_entity(client, configured_project_with_one_data_row, dataset_pdf_entity, rand_gen): # for content "Metal-insulator (MI) transitions have been one of the" in OCR JSON extract tests/assets/arxiv-pdf_data_99-word-token-pdfs_0801.3483-lb-textlayer.json document_text_selection = DocumentTextSelection( @@ -322,7 +321,7 @@ def test_pdf_document_entity(client, configured_project_without_data_rows, labels = [] _, data_row_uids = dataset_pdf_entity - configured_project_without_data_rows.create_batch( + configured_project_with_one_data_row.create_batch( rand_gen(str), data_row_uids, # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) @@ -337,7 +336,7 @@ def test_pdf_document_entity(client, configured_project_without_data_rows, import_annotations = MALPredictionImport.create_from_objects( client=client, - project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, name=f"import {str(uuid.uuid4())}", predictions=labels) import_annotations.wait_until_done() @@ -346,14 +345,14 @@ def test_pdf_document_entity(client, configured_project_without_data_rows, def test_nested_video_object_annotations(client, - configured_project_without_data_rows, + configured_project_with_one_data_row, video_data, bbox_video_annotation_objects, rand_gen): labels = [] _, data_row_uids = video_data - configured_project_without_data_rows.update(media_type=MediaType.Video) - configured_project_without_data_rows.create_batch( + configured_project_with_one_data_row.update(media_type=MediaType.Video) + configured_project_with_one_data_row.create_batch( rand_gen(str), data_row_uids, # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) @@ -365,7 +364,7 @@ def test_nested_video_object_annotations(client, annotations=bbox_video_annotation_objects)) import_annotations = MALPredictionImport.create_from_objects( client=client, - project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, name=f"import {str(uuid.uuid4())}", predictions=labels) import_annotations.wait_until_done() diff --git a/tests/integration/annotation_import/test_conversation_import.py b/tests/integration/annotation_import/test_conversation_import.py index ac2d5419c..9f1d26e31 100644 --- a/tests/integration/annotation_import/test_conversation_import.py +++ b/tests/integration/annotation_import/test_conversation_import.py @@ -7,7 +7,7 @@ from labelbox.schema.annotation_import import MALPredictionImport -def test_conversation_entity(client, configured_project_without_data_rows, +def test_conversation_entity(client, configured_project_with_one_data_row, dataset_conversation_entity, rand_gen): conversation_entity_annotation = ConversationEntity(start=0, @@ -20,7 +20,7 @@ def test_conversation_entity(client, configured_project_without_data_rows, labels = [] _, data_row_uids = dataset_conversation_entity - configured_project_without_data_rows.create_batch( + configured_project_with_one_data_row.create_batch( rand_gen(str), data_row_uids, # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) @@ -35,7 +35,7 @@ def test_conversation_entity(client, configured_project_without_data_rows, import_annotations = MALPredictionImport.create_from_objects( client=client, - 
project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, name=f"import {str(uuid.uuid4())}", predictions=labels) diff --git a/tests/integration/annotation_import/test_data_types.py b/tests/integration/annotation_import/test_data_types.py index 30559198b..5de79f5cc 100644 --- a/tests/integration/annotation_import/test_data_types.py +++ b/tests/integration/annotation_import/test_data_types.py @@ -125,6 +125,7 @@ def create_data_row_for_project(project, dataset, data_row_ndjson, batch_name): [data_row.uid], # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) ) + project.data_row_ids.append(data_row.uid) return data_row @@ -134,11 +135,11 @@ def create_data_row_for_project(project, dataset, data_row_ndjson, batch_name): AudioData, ConversationData, DicomData, DocumentData, HTMLData, ImageData, TextData ]) -def test_import_data_types(client, project, initial_dataset, rand_gen, - data_row_json_by_data_type, annotations_by_data_type, - data_type_class): +def test_import_data_types(client, configured_project, initial_dataset, + rand_gen, data_row_json_by_data_type, + annotations_by_data_type, data_type_class): - project = project + project = configured_project project_id = project.uid dataset = initial_dataset @@ -260,11 +261,11 @@ def test_import_data_types_v2(client, configured_project, initial_dataset, @pytest.mark.parametrize('data_type, data_class, annotations', test_params) -def test_import_label_annotations(client, configured_project_without_data_rows, +def test_import_label_annotations(client, configured_project_with_one_data_row, initial_dataset, data_row_json_by_data_type, data_type, data_class, annotations, rand_gen): - project = configured_project_without_data_rows + project = configured_project_with_one_data_row dataset = initial_dataset set_project_media_type_from_data_type(project, data_class) @@ -297,13 +298,13 @@ def test_import_label_annotations(client, configured_project_without_data_rows, expected_annotations = get_annotation_comparison_dicts_from_labels(labels) actual_annotations = get_annotation_comparison_dicts_from_export( export_task.result, data_row.uid, - configured_project_without_data_rows.uid) + configured_project_with_one_data_row.uid) assert actual_annotations == expected_annotations data_row.delete() @pytest.mark.parametrize('data_type, data_class, annotations', test_params) -def test_import_mal_annotations(client, configured_project_without_data_rows, +def test_import_mal_annotations(client, configured_project_with_one_data_row, data_row_json_by_data_type, data_type, data_class, annotations, rand_gen): @@ -311,10 +312,10 @@ def test_import_mal_annotations(client, configured_project_without_data_rows, data_row_json = data_row_json_by_data_type[data_type] data_row = dataset.create_data_row(data_row_json) - set_project_media_type_from_data_type(configured_project_without_data_rows, + set_project_media_type_from_data_type(configured_project_with_one_data_row, data_class) - configured_project_without_data_rows.create_batch( + configured_project_with_one_data_row.create_batch( rand_gen(str), [data_row.uid], ) @@ -326,7 +327,7 @@ def test_import_mal_annotations(client, configured_project_without_data_rows, import_annotations = lb.MALPredictionImport.create_from_objects( client=client, - project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, name=f"import {str(uuid.uuid4())}", predictions=labels) import_annotations.wait_until_done() diff --git 
a/tests/integration/annotation_import/test_label_import.py b/tests/integration/annotation_import/test_label_import.py index 198ce2e3e..61c602c52 100644 --- a/tests/integration/annotation_import/test_label_import.py +++ b/tests/integration/annotation_import/test_label_import.py @@ -9,13 +9,16 @@ """ -def test_create_from_url(client, configured_project, +def test_create_from_url(client, configured_project_with_one_data_row, annotation_import_test_helpers): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" label_import = LabelImport.create_from_url( - client=client, project_id=configured_project.uid, name=name, url=url) - assert label_import.parent_id == configured_project.uid + client=client, + project_id=configured_project_with_one_data_row.uid, + name=name, + url=url) + assert label_import.parent_id == configured_project_with_one_data_row.uid annotation_import_test_helpers.check_running_state(label_import, name, url) @@ -52,13 +55,17 @@ def test_create_from_objects(client, configured_project, object_predictions, # annotation_import_test_helpers.assert_file_content(label_import.input_file_url, object_predictions) -def test_get(client, configured_project, annotation_import_test_helpers): +def test_get(client, configured_project_with_one_data_row, + annotation_import_test_helpers): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" label_import = LabelImport.create_from_url( - client=client, project_id=configured_project.uid, name=name, url=url) + client=client, + project_id=configured_project_with_one_data_row.uid, + name=name, + url=url) - assert label_import.parent_id == configured_project.uid + assert label_import.parent_id == configured_project_with_one_data_row.uid annotation_import_test_helpers.check_running_state(label_import, name, url) diff --git a/tests/integration/annotation_import/test_model.py b/tests/integration/annotation_import/test_model.py index dcfe9ef2c..131ecd9d0 100644 --- a/tests/integration/annotation_import/test_model.py +++ b/tests/integration/annotation_import/test_model.py @@ -4,14 +4,14 @@ from labelbox.exceptions import ResourceNotFoundError -def test_model(client, configured_project, rand_gen): +def test_model(client, configured_project_with_one_data_row, rand_gen): # Get all models = list(client.get_models()) for m in models: assert isinstance(m, Model) # Create - ontology = configured_project.ontology() + ontology = configured_project_with_one_data_row.ontology() data = {"name": rand_gen(str), "ontology_id": ontology.uid} model = client.create_model(data["name"], data["ontology_id"]) assert model.name == data["name"] diff --git a/tests/integration/annotation_import/test_model_run.py b/tests/integration/annotation_import/test_model_run.py index c94c78cde..328b38ba5 100644 --- a/tests/integration/annotation_import/test_model_run.py +++ b/tests/integration/annotation_import/test_model_run.py @@ -87,11 +87,12 @@ def test_model_run_data_rows_delete(model_run_with_data_rows): assert len(before) == len(after) + 1 -def test_model_run_upsert_data_rows(dataset, model_run, configured_project): +def test_model_run_upsert_data_rows(dataset, model_run, + configured_project_with_one_data_row): n_model_run_data_rows = len(list(model_run.model_run_data_rows())) assert n_model_run_data_rows == 0 data_row = dataset.create_data_row(row_data="test row data") - configured_project._wait_until_data_rows_are_processed( + 
configured_project_with_one_data_row._wait_until_data_rows_are_processed( data_row_ids=[data_row.uid]) model_run.upsert_data_rows([data_row.uid]) n_model_run_data_rows = len(list(model_run.model_run_data_rows())) @@ -167,15 +168,14 @@ def get_model_run_status(): errorMessage) -def test_model_run_split_assignment_by_data_row_ids(model_run, dataset, - image_url, - configured_project): - n_data_rows = 10 +def test_model_run_split_assignment_by_data_row_ids( + model_run, dataset, image_url, configured_project_with_one_data_row): + n_data_rows = 2 data_rows = dataset.create_data_rows([{ "row_data": image_url } for _ in range(n_data_rows)]) data_row_ids = [data_row['id'] for data_row in data_rows.result] - configured_project._wait_until_data_rows_are_processed( + configured_project_with_one_data_row._wait_until_data_rows_are_processed( data_row_ids=data_row_ids) model_run.upsert_data_rows(data_row_ids) diff --git a/tests/integration/annotation_import/test_ndjson_validation.py b/tests/integration/annotation_import/test_ndjson_validation.py index 466968e85..123752402 100644 --- a/tests/integration/annotation_import/test_ndjson_validation.py +++ b/tests/integration/annotation_import/test_ndjson_validation.py @@ -13,19 +13,27 @@ from labelbox.schema.queue_mode import QueueMode -@pytest.fixture -def configured_project_without_data_rows(client, - ontology, - rand_gen, - scope="module"): +@pytest.fixture(scope="module", autouse=True) +def hardcoded_datarow_id(): + data_row_id = 'ck8q9q9qj00003g5z3q1q9q9q' + + def get_data_row_id(indx=0): + return data_row_id + + yield get_data_row_id + + +@pytest.fixture(scope="module", autouse=True) +def configured_project_with_ontology(client, ontology, rand_gen): project = client.create_project(name=rand_gen(str), - description=rand_gen(str), queue_mode=QueueMode.Batch) editor = list( client.get_labeling_frontends( where=LabelingFrontend.name == "editor"))[0] project.setup(editor, ontology) + yield project + project.delete() @@ -55,197 +63,194 @@ def test_tool_construction(inference, expected_type): def test_incorrect_feature_schema(rectangle_inference, polygon_inference, - configured_project_without_data_rows): + configured_project_with_ontology): #Valid but incorrect feature schema #Prob the error message says something about the config not anything useful. We might want to fix this. 
pred = rectangle_inference.copy() pred['schemaId'] = polygon_inference['schemaId'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def no_tool(text_inference, configured_project_without_data_rows): +def no_tool(text_inference, configured_project_with_ontology): pred = text_inference.copy() #Missing key del pred['answer'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_invalid_text(text_inference, configured_project_without_data_rows): +def test_invalid_text(text_inference, configured_project_with_ontology): #and if it is not a string pred = text_inference.copy() #Extra and wrong key del pred['answer'] pred['answers'] = [] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) del pred['answers'] #Invalid type pred['answer'] = [] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) #Invalid type pred['answer'] = None with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) def test_invalid_checklist_item(checklist_inference, - configured_project_without_data_rows): + configured_project_with_ontology): #Only two points pred = checklist_inference.copy() pred['answers'] = [pred['answers'][0], pred['answers'][0]] #Duplicate schema ids with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) pred['answers'] = [{"name": "asdfg"}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) pred['answers'] = [{"schemaId": "1232132132"}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) pred['answers'] = [{}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) pred['answers'] = [] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) del pred['answers'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_invalid_polygon(polygon_inference, - configured_project_without_data_rows): +def test_invalid_polygon(polygon_inference, configured_project_with_ontology): #Only two points pred = polygon_inference.copy() pred['polygon'] = [{"x": 100, "y": 100}, {"x": 200, "y": 200}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_incorrect_entity(entity_inference, - configured_project_without_data_rows): +def test_incorrect_entity(entity_inference, configured_project_with_ontology): entity = entity_inference.copy() #Location cannot be a list 
entity["location"] = [0, 10] with pytest.raises(MALValidationError): - _validate_ndjson([entity], configured_project_without_data_rows) + _validate_ndjson([entity], configured_project_with_ontology) entity["location"] = {"start": -1, "end": 5} with pytest.raises(MALValidationError): - _validate_ndjson([entity], configured_project_without_data_rows) + _validate_ndjson([entity], configured_project_with_ontology) entity["location"] = {"start": 15, "end": 5} with pytest.raises(MALValidationError): - _validate_ndjson([entity], configured_project_without_data_rows) + _validate_ndjson([entity], configured_project_with_ontology) def test_incorrect_mask(segmentation_inference, - configured_project_without_data_rows): + configured_project_with_ontology): seg = segmentation_inference.copy() seg['mask']['colorRGB'] = [-1, 0, 10] with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project_without_data_rows) + _validate_ndjson([seg], configured_project_with_ontology) seg['mask']['colorRGB'] = [0, 0] with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project_without_data_rows) + _validate_ndjson([seg], configured_project_with_ontology) seg['mask'] = {'counts': [0], 'size': [0, 1]} with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project_without_data_rows) + _validate_ndjson([seg], configured_project_with_ontology) seg['mask'] = {'counts': [-1], 'size': [1, 1]} with pytest.raises(MALValidationError): - _validate_ndjson([seg], configured_project_without_data_rows) + _validate_ndjson([seg], configured_project_with_ontology) -def test_all_validate_json(configured_project_without_data_rows, predictions): +def test_all_validate_json(configured_project_with_ontology, predictions): #Predictions contains one of each type of prediction. #These should be properly formatted and pass. 
- _validate_ndjson(predictions, configured_project_without_data_rows) + _validate_ndjson(predictions, configured_project_with_ontology) -def test_incorrect_line(line_inference, configured_project_without_data_rows): +def test_incorrect_line(line_inference, configured_project_with_ontology): line = line_inference.copy() line["line"] = [line["line"][0]] #Just one point with pytest.raises(MALValidationError): - _validate_ndjson([line], configured_project_without_data_rows) + _validate_ndjson([line], configured_project_with_ontology) def test_incorrect_rectangle(rectangle_inference, - configured_project_without_data_rows): + configured_project_with_ontology): del rectangle_inference['bbox']['top'] with pytest.raises(MALValidationError): _validate_ndjson([rectangle_inference], - configured_project_without_data_rows) + configured_project_with_ontology) -def test_duplicate_tools(rectangle_inference, - configured_project_without_data_rows): +def test_duplicate_tools(rectangle_inference, configured_project_with_ontology): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() pred['polygon'] = [{"x": 100, "y": 100}, {"x": 200, "y": 200}] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_invalid_feature_schema(configured_project_without_data_rows, +def test_invalid_feature_schema(configured_project_with_ontology, rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() pred['schemaId'] = "blahblah" with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_name_only_feature_schema(configured_project_without_data_rows, +def test_name_only_feature_schema(configured_project_with_ontology, rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() del pred['schemaId'] - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_schema_id_only_feature_schema(configured_project_without_data_rows, +def test_schema_id_only_feature_schema(configured_project_with_ontology, rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() del pred['name'] - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_missing_feature_schema(configured_project_without_data_rows, +def test_missing_feature_schema(configured_project_with_ontology, rectangle_inference): #Trying to upload a polygon and rectangle at the same time pred = rectangle_inference.copy() del pred['schemaId'] del pred['name'] with pytest.raises(MALValidationError): - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) -def test_validate_ndjson(tmp_path, configured_project_without_data_rows): +def test_validate_ndjson(tmp_path, configured_project_with_ontology): file_name = f"broken.ndjson" file_path = tmp_path / file_name with file_path.open("w") as f: f.write("test") with pytest.raises(ValueError): - configured_project_without_data_rows.upload_annotations( + configured_project_with_ontology.upload_annotations( name="name", annotations=str(file_path), validate=True) -def 
test_validate_ndjson_uuid(tmp_path, configured_project_without_data_rows, +def test_validate_ndjson_uuid(tmp_path, configured_project_with_ontology, predictions): file_name = f"repeat_uuid.ndjson" file_path = tmp_path / file_name @@ -257,15 +262,15 @@ def test_validate_ndjson_uuid(tmp_path, configured_project_without_data_rows, parser.dump(repeat_uuid, f) with pytest.raises(MALValidationError): - configured_project_without_data_rows.upload_annotations( + configured_project_with_ontology.upload_annotations( name="name", validate=True, annotations=str(file_path)) with pytest.raises(MALValidationError): - configured_project_without_data_rows.upload_annotations( + configured_project_with_ontology.upload_annotations( name="name", validate=True, annotations=repeat_uuid) def test_video_upload(video_checklist_inference, - configured_project_without_data_rows): + configured_project_with_ontology): pred = video_checklist_inference.copy() - _validate_ndjson([pred], configured_project_without_data_rows) + _validate_ndjson([pred], configured_project_with_ontology) diff --git a/tests/integration/annotation_import/test_upsert_prediction_import.py b/tests/integration/annotation_import/test_upsert_prediction_import.py index 9ed045f5e..927b6526d 100644 --- a/tests/integration/annotation_import/test_upsert_prediction_import.py +++ b/tests/integration/annotation_import/test_upsert_prediction_import.py @@ -13,7 +13,7 @@ @pytest.mark.skip() def test_create_from_url(client, tmp_path, object_predictions, model_run_with_data_rows, - configured_project_without_data_rows, + configured_project_with_one_data_row, annotation_import_test_helpers): name = str(uuid.uuid4()) file_name = f"{name}.json" @@ -41,7 +41,7 @@ def test_create_from_url(client, tmp_path, object_predictions, annotation_import, batch, mal_prediction_import = model_run_with_data_rows.upsert_predictions_and_send_to_project( name=name, predictions=url, - project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, priority=5) assert annotation_import.model_run_id == model_run_with_data_rows.uid @@ -50,7 +50,7 @@ def test_create_from_url(client, tmp_path, object_predictions, assert annotation_import.statuses assert batch - assert batch.project().uid == configured_project_without_data_rows.uid + assert batch.project().uid == configured_project_with_one_data_row.uid assert mal_prediction_import mal_prediction_import.wait_until_done() @@ -61,7 +61,7 @@ def test_create_from_url(client, tmp_path, object_predictions, @pytest.mark.skip() def test_create_from_objects(model_run_with_data_rows, - configured_project_without_data_rows, + configured_project_with_one_data_row, object_predictions, annotation_import_test_helpers): name = str(uuid.uuid4()) @@ -76,7 +76,7 @@ def test_create_from_objects(model_run_with_data_rows, annotation_import, batch, mal_prediction_import = model_run_with_data_rows.upsert_predictions_and_send_to_project( name=name, predictions=predictions, - project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, priority=5) assert annotation_import.model_run_id == model_run_with_data_rows.uid @@ -85,7 +85,7 @@ def test_create_from_objects(model_run_with_data_rows, assert annotation_import.statuses assert batch - assert batch.project().uid == configured_project_without_data_rows.uid + assert batch.project().uid == configured_project_with_one_data_row.uid assert mal_prediction_import mal_prediction_import.wait_until_done() @@ -96,7 +96,7 @@ def 
test_create_from_objects(model_run_with_data_rows, @pytest.mark.skip() def test_create_from_local_file(tmp_path, model_run_with_data_rows, - configured_project_without_data_rows, + configured_project_with_one_data_row, object_predictions, annotation_import_test_helpers): @@ -119,7 +119,7 @@ def test_create_from_local_file(tmp_path, model_run_with_data_rows, annotation_import, batch, mal_prediction_import = model_run_with_data_rows.upsert_predictions_and_send_to_project( name=name, predictions=str(file_path), - project_id=configured_project_without_data_rows.uid, + project_id=configured_project_with_one_data_row.uid, priority=5) assert annotation_import.model_run_id == model_run_with_data_rows.uid @@ -128,7 +128,7 @@ def test_create_from_local_file(tmp_path, model_run_with_data_rows, assert annotation_import.statuses assert batch - assert batch.project().uid == configured_project_without_data_rows.uid + assert batch.project().uid == configured_project_with_one_data_row.uid assert mal_prediction_import mal_prediction_import.wait_until_done() diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 82c739ddb..56ce6bae1 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -21,6 +21,7 @@ from labelbox.schema.annotation_import import LabelImport from labelbox.schema.enums import AnnotationImportState from labelbox.schema.invite import Invite +from labelbox.schema.project import Project from labelbox.schema.queue_mode import QueueMode from labelbox.schema.user import User @@ -425,16 +426,21 @@ def configured_project_with_label(client, rand_gen, image_url, project, dataset, Additionally includes a create_label method for any needed extra labels One label is already created and yielded when using fixture """ + start_time = time.time() + project._wait_until_data_rows_are_processed(data_row_ids=[data_row.uid], + sleep_interval=3) project.create_batch( rand_gen(str), [data_row.uid], # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) ) + print("create_batch took: ", time.time() - start_time) ontology = _setup_ontology(project) + print("setup ontology took: ", time.time() - start_time) label = _create_label(project, data_row, ontology, wait_for_label_processing) - + print("create_label took: ", time.time() - start_time) yield [project, dataset, data_row, label] for label in project.labels(): @@ -817,11 +823,13 @@ def upload_invalid_data_rows_for_dataset(dataset: Dataset): task.wait_till_done() +@pytest.mark.skipif("FIXTURE_PROFILE" not in os.environ) def pytest_configure(): pytest.report = defaultdict(int) pytest.data_row_report = {'times': 0, 'num_rows': 0} +@pytest.mark.skipif("FIXTURE_PROFILE" not in os.environ) @pytest.hookimpl(hookwrapper=True) def pytest_fixture_setup(fixturedef, request): start = time.time() @@ -832,11 +840,8 @@ def pytest_fixture_setup(fixturedef, request): exec_time = end - start pytest.report[fixturedef.argname] += exec_time - # print('pytest_fixture_setup' - # f', request={request}' - # f', create_data_row_time={end - start}') - +@pytest.mark.skipif("FIXTURE_PROFILE" not in os.environ) @pytest.fixture(scope='session', autouse=True) def print_perf_summary(): yield diff --git a/tests/integration/export_v2/conftest.py b/tests/integration/export_v2/conftest.py index af8b4c66f..757bba44e 100644 --- a/tests/integration/export_v2/conftest.py +++ b/tests/integration/export_v2/conftest.py @@ -297,7 +297,7 @@ def configured_project_with_ontology(client, initial_dataset, ontology, @pytest.fixture -def 
configured_project_without_data_rows(client, ontology, rand_gen): +def configured_project_with_one_data_row(client, ontology, rand_gen): project = client.create_project(name=rand_gen(str), description=rand_gen(str), queue_mode=QueueMode.Batch) diff --git a/tests/integration/export_v2/test_export_video.py b/tests/integration/export_v2/test_export_video.py index 863f4d31e..94828e1b6 100644 --- a/tests/integration/export_v2/test_export_video.py +++ b/tests/integration/export_v2/test_export_video.py @@ -5,11 +5,11 @@ from labelbox.schema.annotation_import import AnnotationImportState -def test_export_v2_video(client, configured_project_without_data_rows, +def test_export_v2_video(client, configured_project_with_one_data_row, video_data, video_data_row, bbox_video_annotation_objects, rand_gen): - project = configured_project_without_data_rows + project = configured_project_with_one_data_row project_id = project.uid labels = [] _, data_row_uids = video_data From 65990878d68bb1c35a3c9e6f6dff27c83ba1ea40 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Fri, 18 Aug 2023 10:58:02 -0700 Subject: [PATCH 13/20] Remove more sources of data leakage --- .../annotation_import/test_data_types.py | 19 ++++++++++++++----- tests/integration/conftest.py | 2 ++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/tests/integration/annotation_import/test_data_types.py b/tests/integration/annotation_import/test_data_types.py index 5de79f5cc..a5c27eb20 100644 --- a/tests/integration/annotation_import/test_data_types.py +++ b/tests/integration/annotation_import/test_data_types.py @@ -137,7 +137,8 @@ def create_data_row_for_project(project, dataset, data_row_ndjson, batch_name): ]) def test_import_data_types(client, configured_project, initial_dataset, rand_gen, data_row_json_by_data_type, - annotations_by_data_type, data_type_class): + annotations_by_data_type, data_type_class, + one_datarow): project = configured_project project_id = project.uid @@ -304,14 +305,22 @@ def test_import_label_annotations(client, configured_project_with_one_data_row, @pytest.mark.parametrize('data_type, data_class, annotations', test_params) -def test_import_mal_annotations(client, configured_project_with_one_data_row, - data_row_json_by_data_type, data_type, - data_class, annotations, rand_gen): - +@pytest.fixture +def one_datarow(client, rand_gen, data_row_json_by_data_type, data_type): dataset = client.create_dataset(name=rand_gen(str)) data_row_json = data_row_json_by_data_type[data_type] data_row = dataset.create_data_row(data_row_json) + yield data_row + + dataset.delete() + + +@pytest.mark.parametrize('data_type, data_class, annotations', test_params) +def test_import_mal_annotations(client, configured_project_with_one_data_row, + data_type, data_class, annotations, rand_gen, + one_datarow): + data_row = one_datarow set_project_media_type_from_data_type(configured_project_with_one_data_row, data_class) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 56ce6bae1..af66a7ed4 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -390,6 +390,8 @@ def initial_dataset(client, rand_gen): dataset = client.create_dataset(name=rand_gen(str)) yield dataset + dataset.delete() + @pytest.fixture def project_with_ontology(project): From 9e41e82169da875ec0d9aee9638fa6fbca864b60 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Fri, 18 Aug 2023 11:31:08 -0700 Subject: [PATCH 14/20] Add config for fixture profiling --- Makefile | 1 + .../integration/annotation_import/conftest.py | 
11 +----- .../annotation_import/test_data_types.py | 13 +++++-- tests/integration/conftest.py | 37 +++++++------------ 4 files changed, 25 insertions(+), 37 deletions(-) diff --git a/Makefile b/Makefile index f9f490554..b7838a7d4 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,7 @@ test-local: build-image -e LABELBOX_TEST_ENVIRON="local" \ -e DA_GCP_LABELBOX_API_KEY=${DA_GCP_LABELBOX_API_KEY} \ -e LABELBOX_TEST_API_KEY_LOCAL=${LABELBOX_TEST_API_KEY_LOCAL} \ + -e FIXTURE_PROFILE=true \ local/labelbox-python:test pytest $(PATH_TO_TEST) test-staging: build-image diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index 3f1cd7de5..988ad7883 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -498,7 +498,6 @@ def get_data_row_id(indx=0): @pytest.fixture def configured_project(client, initial_dataset, ontology, rand_gen, image_url): - start_time = time.time() dataset = initial_dataset project = client.create_project(name=rand_gen(str), queue_mode=QueueMode.Batch) @@ -515,15 +514,12 @@ def configured_project(client, initial_dataset, ontology, rand_gen, image_url): num_rows += 1 project._wait_until_data_rows_are_processed(data_row_ids=data_row_ids, sleep_interval=3) - if pytest.data_row_report: - pytest.data_row_report['times'] += time.time() - start_time - pytest.data_row_report['num_rows'] += num_rows + project.create_batch( rand_gen(str), data_row_ids, # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) ) - print("After creating batch ", time.time() - start_time) project.data_row_ids = data_row_ids yield project @@ -579,8 +575,6 @@ def dataset_conversation_entity(client, rand_gen, conversation_entity_data_row, @pytest.fixture def configured_project_with_one_data_row(client, ontology, rand_gen, initial_dataset, image_url): - start_time = time.time() - project = client.create_project(name=rand_gen(str), description=rand_gen(str), queue_mode=QueueMode.Batch) @@ -594,9 +588,6 @@ def configured_project_with_one_data_row(client, ontology, rand_gen, project._wait_until_data_rows_are_processed(data_row_ids=data_row_ids, sleep_interval=3) - if pytest.data_row_report: - pytest.data_row_report['times'] += time.time() - start_time - pytest.data_row_report['num_rows'] += 1 batch = project.create_batch( rand_gen(str), data_row_ids, # sample of data row objects diff --git a/tests/integration/annotation_import/test_data_types.py b/tests/integration/annotation_import/test_data_types.py index a5c27eb20..79e8b03cb 100644 --- a/tests/integration/annotation_import/test_data_types.py +++ b/tests/integration/annotation_import/test_data_types.py @@ -135,10 +135,15 @@ def create_data_row_for_project(project, dataset, data_row_ndjson, batch_name): AudioData, ConversationData, DicomData, DocumentData, HTMLData, ImageData, TextData ]) -def test_import_data_types(client, configured_project, initial_dataset, - rand_gen, data_row_json_by_data_type, - annotations_by_data_type, data_type_class, - one_datarow): +def test_import_data_types( + client, + configured_project, + initial_dataset, + rand_gen, + data_row_json_by_data_type, + annotations_by_data_type, + data_type_class, +): project = configured_project project_id = project.uid diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index af66a7ed4..a6651b97d 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -27,8 +27,6 @@ IMG_URL = "https://picsum.photos/200/300.jpg" 
SMALL_DATASET_URL = "https://storage.googleapis.com/lb-artifacts-testing-public/sdk_integration_test/potato.jpeg" -DATA_ROW_PROCESSING_WAIT_TIMEOUT_SECONDS = 30 -DATA_ROW_PROCESSING_WAIT_SLEEP_INTERNAL_SECONDS = 5 class Environ(Enum): @@ -458,10 +456,8 @@ def configured_batch_project_with_label(project, dataset, data_row, One label is already created and yielded when using fixture """ data_rows = [dr.uid for dr in list(dataset.data_rows())] - project._wait_until_data_rows_are_processed( - data_row_ids=data_rows, - wait_processing_max_seconds=DATA_ROW_PROCESSING_WAIT_TIMEOUT_SECONDS, - sleep_interval=DATA_ROW_PROCESSING_WAIT_SLEEP_INTERNAL_SECONDS) + project._wait_until_data_rows_are_processed(data_row_ids=data_rows, + sleep_interval=3) project.create_batch("test-batch", data_rows) project.data_row_ids = data_rows @@ -604,7 +600,6 @@ def configured_project_with_complex_ontology(client, initial_dataset, rand_gen, project.setup(editor, ontology.asdict()) yield [project, data_row] - dataset.delete() project.delete() @@ -825,35 +820,31 @@ def upload_invalid_data_rows_for_dataset(dataset: Dataset): task.wait_till_done() -@pytest.mark.skipif("FIXTURE_PROFILE" not in os.environ) def pytest_configure(): pytest.report = defaultdict(int) - pytest.data_row_report = {'times': 0, 'num_rows': 0} -@pytest.mark.skipif("FIXTURE_PROFILE" not in os.environ) @pytest.hookimpl(hookwrapper=True) -def pytest_fixture_setup(fixturedef, request): +def pytest_fixture_setup(fixturedef): start = time.time() yield - end = time.time() exec_time = end - start - pytest.report[fixturedef.argname] += exec_time + if "FIXTURE_PROFILE" in os.environ: + pytest.report[fixturedef.argname] += exec_time -@pytest.mark.skipif("FIXTURE_PROFILE" not in os.environ) @pytest.fixture(scope='session', autouse=True) def print_perf_summary(): yield - sorted_dict = dict( - sorted(pytest.report.items(), key=lambda item: item[1], reverse=True)) - num_of_entries = 10 if len(sorted_dict) >= 10 else len(sorted_dict) - slowest_fixtures = [ - (aaa, sorted_dict[aaa]) for aaa in islice(sorted_dict, num_of_entries) - ] - print("\nTop slowest fixtures:\n", slowest_fixtures, file=sys.stderr) - print("Data row report:\n", pytest.data_row_report, file=sys.stderr) - # assert False + if "FIXTURE_PROFILE" in os.environ: + sorted_dict = dict( + sorted(pytest.report.items(), + key=lambda item: item[1], + reverse=True)) + num_of_entries = 10 if len(sorted_dict) >= 10 else len(sorted_dict) + slowest_fixtures = [(aaa, sorted_dict[aaa]) + for aaa in islice(sorted_dict, num_of_entries)] + print("\nTop slowest fixtures:\n", slowest_fixtures, file=sys.stderr) From ba2990d68087b575a7f27ed6e366fab1bb7caf53 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Fri, 18 Aug 2023 15:02:14 -0700 Subject: [PATCH 15/20] Add explanation on how to supply data row ids to prediction_id_mapping --- .../integration/annotation_import/conftest.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index 988ad7883..1980d6f26 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -505,13 +505,11 @@ def configured_project(client, initial_dataset, ontology, rand_gen, image_url): client.get_labeling_frontends( where=LabelingFrontend.name == "editor"))[0] project.setup(editor, ontology) - num_rows = 0 data_row_ids = [] for _ in range(len(ontology['tools']) + len(ontology['classifications'])): 
data_row_ids.append(dataset.create_data_row(row_data=image_url).uid) - num_rows += 1 project._wait_until_data_rows_are_processed(data_row_ids=data_row_ids, sleep_interval=3) @@ -605,6 +603,22 @@ def configured_project_with_one_data_row(client, ontology, rand_gen, # At the moment it expects only one feature per tool type and this creates unnecessary coupling between differet tests # In an example of a 'rectangle' we have extended to support multiple instances of the same tool type # TODO: we will support this approach in the future for all tools +# +""" +Please note that this fixture now offers the flexibility to configure three different strategies for generating data row ids for predictions: +Default(configured_project fixture): + configured_project that generates a data row for each member of ontology. + This makes sure each prediction has its own data row id. This is applicable to prediction upload cases when last label overwrites existing ones + +Optimized Strategy (configured_project_with_one_data_row fixture): + This fixture has only one data row and all predictions will be mapped to it + +Custom Data Row IDs Strategy: + Individuals can create their own fixture to supply data row ids. + This particular fixture, termed "hardcoded_datarow_id," should be defined locally within a test file. +""" + + @pytest.fixture def prediction_id_mapping(ontology, request): # Maps tool types to feature schema ids From 700fefe87cb2259ea78b8994f986b641ce954cd7 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Fri, 18 Aug 2023 15:50:42 -0700 Subject: [PATCH 16/20] Fix test_user_and_org.py --- tests/integration/test_user_and_org.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_user_and_org.py b/tests/integration/test_user_and_org.py index 9f07666de..ca158527c 100644 --- a/tests/integration/test_user_and_org.py +++ b/tests/integration/test_user_and_org.py @@ -1,3 +1,6 @@ +from labelbox.schema.project import Project + + def test_user(client): user = client.get_user() assert user.uid is not None @@ -10,14 +13,11 @@ def test_organization(client): assert client.get_user() in set(organization.users()) -def test_user_and_org_projects(project): - client = project.client +def test_user_and_org_projects(client, project): user = client.get_user() org = client.get_organization() - user_projects = set(user.projects()) - org_projects = set(org.projects()) + user_project = user.projects(where=Project.uid == project.uid) + org_project = org.projects(where=Project.uid == project.uid) - assert project.created_by() == user - assert project.organization() == org - assert project in user_projects - assert project in org_projects \ No newline at end of file + assert user_project + assert org_project \ No newline at end of file From 5a6e250fba457336a78bd249c1cc22efc1973842 Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Tue, 22 Aug 2023 17:18:27 -0700 Subject: [PATCH 17/20] PR updates --- .../integration/annotation_import/conftest.py | 8 +++++-- .../test_bulk_import_request.py | 8 +++---- tests/integration/conftest.py | 22 ++++++++++--------- tests/integration/export_v2/conftest.py | 2 +- .../export_v2/test_export_video.py | 4 ++-- tests/integration/test_filtering.py | 4 ++-- tests/integration/test_project.py | 6 ++--- 7 files changed, 30 insertions(+), 24 deletions(-) diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index 1980d6f26..ebfe74f47 100644 --- a/tests/integration/annotation_import/conftest.py +++ 
b/tests/integration/annotation_import/conftest.py @@ -513,7 +513,7 @@ def configured_project(client, initial_dataset, ontology, rand_gen, image_url): project._wait_until_data_rows_are_processed(data_row_ids=data_row_ids, sleep_interval=3) - project.create_batch( + batch = project.create_batch( rand_gen(str), data_row_ids, # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) @@ -521,6 +521,8 @@ def configured_project(client, initial_dataset, ontology, rand_gen, image_url): project.data_row_ids = data_row_ids yield project + + batch.delete() project.delete() @@ -614,8 +616,10 @@ def configured_project_with_one_data_row(client, ontology, rand_gen, This fixture has only one data row and all predictions will be mapped to it Custom Data Row IDs Strategy: - Individuals can create their own fixture to supply data row ids. + Individuals can supply hard-coded data row ids when a creation of data row is not required. This particular fixture, termed "hardcoded_datarow_id," should be defined locally within a test file. + In the future, we can use this approach to inject correct number of rows instead of using configured_project fixture + that creates a data row for each member of ontology (14 in total) for each run. """ diff --git a/tests/integration/annotation_import/test_bulk_import_request.py b/tests/integration/annotation_import/test_bulk_import_request.py index 6691cc044..52552f53d 100644 --- a/tests/integration/annotation_import/test_bulk_import_request.py +++ b/tests/integration/annotation_import/test_bulk_import_request.py @@ -41,13 +41,13 @@ def test_create_from_url(project): assert bulk_import_request.state == BulkImportRequestState.RUNNING -def test_validate_file(project_with_ontology): +def test_validate_file(project_with_empty_ontology): name = str(uuid.uuid4()) url = "https://storage.googleapis.com/labelbox-public-bucket/predictions_test_v2.ndjson" with pytest.raises(MALValidationError): - project_with_ontology.upload_annotations(name=name, - annotations=url, - validate=True) + project_with_empty_ontology.upload_annotations(name=name, + annotations=url, + validate=True) #Schema ids shouldn't match diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index a6651b97d..781fe6edb 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -27,6 +27,8 @@ IMG_URL = "https://picsum.photos/200/300.jpg" SMALL_DATASET_URL = "https://storage.googleapis.com/lb-artifacts-testing-public/sdk_integration_test/potato.jpeg" +DATA_ROW_PROCESSING_WAIT_TIMEOUT_SECONDS = 30 +DATA_ROW_PROCESSING_WAIT_SLEEP_INTERNAL_SECONDS = 3 class Environ(Enum): @@ -392,7 +394,7 @@ def initial_dataset(client, rand_gen): @pytest.fixture -def project_with_ontology(project): +def project_with_empty_ontology(project): editor = list( project.client.get_labeling_frontends( where=LabelingFrontend.name == "editor"))[0] @@ -402,13 +404,13 @@ def project_with_ontology(project): @pytest.fixture -def configured_project(project_with_ontology, initial_dataset, rand_gen, +def configured_project(project_with_empty_ontology, initial_dataset, rand_gen, image_url): dataset = initial_dataset data_row_id = dataset.create_data_row(row_data=image_url).uid - project = project_with_ontology + project = project_with_empty_ontology - project.create_batch( + batch = project.create_batch( rand_gen(str), [data_row_id], # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) @@ -417,6 +419,8 @@ def configured_project(project_with_ontology, initial_dataset, rand_gen, yield 
project + batch.delete() + @pytest.fixture def configured_project_with_label(client, rand_gen, image_url, project, dataset, @@ -426,21 +430,19 @@ def configured_project_with_label(client, rand_gen, image_url, project, dataset, Additionally includes a create_label method for any needed extra labels One label is already created and yielded when using fixture """ - start_time = time.time() - project._wait_until_data_rows_are_processed(data_row_ids=[data_row.uid], - sleep_interval=3) + project._wait_until_data_rows_are_processed( + data_row_ids=[data_row.uid], + wait_processing_max_seconds=DATA_ROW_PROCESSING_WAIT_TIMEOUT_SECONDS, + sleep_interval=DATA_ROW_PROCESSING_WAIT_SLEEP_INTERNAL_SECONDS) project.create_batch( rand_gen(str), [data_row.uid], # sample of data row objects 5 # priority between 1(Highest) - 5(lowest) ) - print("create_batch took: ", time.time() - start_time) ontology = _setup_ontology(project) - print("setup ontology took: ", time.time() - start_time) label = _create_label(project, data_row, ontology, wait_for_label_processing) - print("create_label took: ", time.time() - start_time) yield [project, dataset, data_row, label] for label in project.labels(): diff --git a/tests/integration/export_v2/conftest.py b/tests/integration/export_v2/conftest.py index 757bba44e..af8b4c66f 100644 --- a/tests/integration/export_v2/conftest.py +++ b/tests/integration/export_v2/conftest.py @@ -297,7 +297,7 @@ def configured_project_with_ontology(client, initial_dataset, ontology, @pytest.fixture -def configured_project_with_one_data_row(client, ontology, rand_gen): +def configured_project_without_data_rows(client, ontology, rand_gen): project = client.create_project(name=rand_gen(str), description=rand_gen(str), queue_mode=QueueMode.Batch) diff --git a/tests/integration/export_v2/test_export_video.py b/tests/integration/export_v2/test_export_video.py index 94828e1b6..863f4d31e 100644 --- a/tests/integration/export_v2/test_export_video.py +++ b/tests/integration/export_v2/test_export_video.py @@ -5,11 +5,11 @@ from labelbox.schema.annotation_import import AnnotationImportState -def test_export_v2_video(client, configured_project_with_one_data_row, +def test_export_v2_video(client, configured_project_without_data_rows, video_data, video_data_row, bbox_video_annotation_objects, rand_gen): - project = configured_project_with_one_data_row + project = configured_project_without_data_rows project_id = project.uid labels = [] _, data_row_uids = video_data diff --git a/tests/integration/test_filtering.py b/tests/integration/test_filtering.py index fde7f0638..f44cdcdcb 100644 --- a/tests/integration/test_filtering.py +++ b/tests/integration/test_filtering.py @@ -15,7 +15,7 @@ def project_to_test_where(client, rand_gen): p_b = client.create_project(name=p_b_name, queue_mode=QueueMode.Batch) p_c = client.create_project(name=p_c_name, queue_mode=QueueMode.Batch) - yield p_a, p_b, p_c + yield p_a, p_b p_a.delete() p_b.delete() @@ -26,7 +26,7 @@ def project_to_test_where(client, rand_gen): # other builds simultaneously adding projects to test org def test_where(client, project_to_test_where): p_a, p_b, p_c = project_to_test_where - p_a_name, p_b_name, _ = [p.name for p in [p_a, p_b, p_c]] + p_a_name, p_b_name = [p.name for p in [p_a, p_b]] def get(where=None): date_where = Project.created_at >= p_a.created_at diff --git a/tests/integration/test_project.py b/tests/integration/test_project.py index b3b683a3d..94c98ee50 100644 --- a/tests/integration/test_project.py +++ b/tests/integration/test_project.py @@ 
-171,15 +171,15 @@ def test_attach_instructions(client, project): @pytest.mark.skipif(condition=os.environ['LABELBOX_TEST_ENVIRON'] == "onprem", reason="new mutation does not work for onprem") -def test_html_instructions(project_with_ontology): +def test_html_instructions(project_with_empty_ontology): html_file_path = '/tmp/instructions.html' sample_html_str = "" with open(html_file_path, 'w') as file: file.write(sample_html_str) - project_with_ontology.upsert_instructions(html_file_path) - updated_ontology = project_with_ontology.ontology().normalized + project_with_empty_ontology.upsert_instructions(html_file_path) + updated_ontology = project_with_empty_ontology.ontology().normalized instructions = updated_ontology.pop('projectInstructions') assert requests.get(instructions).text == sample_html_str From e585e8c14c89503a99319c342f80903bc5eeeaec Mon Sep 17 00:00:00 2001 From: Val Brodsky Date: Tue, 22 Aug 2023 17:35:52 -0700 Subject: [PATCH 18/20] Turn on fixture profile for staging --- .github/workflows/python-package.yml | 1 + tests/integration/annotation_import/conftest.py | 1 - tests/integration/test_filtering.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 6ed378f09..83c0393af 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -39,6 +39,7 @@ jobs: echo "LABELBOX_TEST_ENVIRON=prod" >> $GITHUB_ENV else echo "LABELBOX_TEST_ENVIRON=staging" >> $GITHUB_ENV + echo "FIXTURE_PROFILE=true" >> $GITHUB_ENV fi - uses: actions/checkout@v2 diff --git a/tests/integration/annotation_import/conftest.py b/tests/integration/annotation_import/conftest.py index ebfe74f47..d50c44d0c 100644 --- a/tests/integration/annotation_import/conftest.py +++ b/tests/integration/annotation_import/conftest.py @@ -522,7 +522,6 @@ def configured_project(client, initial_dataset, ontology, rand_gen, image_url): yield project - batch.delete() project.delete() diff --git a/tests/integration/test_filtering.py b/tests/integration/test_filtering.py index f44cdcdcb..7dd687759 100644 --- a/tests/integration/test_filtering.py +++ b/tests/integration/test_filtering.py @@ -15,7 +15,7 @@ def project_to_test_where(client, rand_gen): p_b = client.create_project(name=p_b_name, queue_mode=QueueMode.Batch) p_c = client.create_project(name=p_c_name, queue_mode=QueueMode.Batch) - yield p_a, p_b + yield p_a, p_b, p_c p_a.delete() p_b.delete() From becff6637314e214630facbae15c29c7d89e14e1 Mon Sep 17 00:00:00 2001 From: Richard Sun Date: Wed, 23 Aug 2023 10:46:32 -0700 Subject: [PATCH 19/20] [QQC-2355] Limit number of data rows to check for processing status at once (#1218) --- labelbox/schema/project.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index addb8c10b..bd912aa0b 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -1454,12 +1454,35 @@ def _wait_until_data_rows_are_processed( """ Wait until all the specified data rows are processed""" start_time = datetime.now() + max_data_rows_per_poll = 100_000 + if data_row_ids is not None: + for i in range(0, len(data_row_ids), max_data_rows_per_poll): + chunk = data_row_ids[i:i + max_data_rows_per_poll] + self._poll_data_row_processing_status( + chunk, [], start_time, wait_processing_max_seconds, + sleep_interval) + + if global_keys is not None: + for i in range(0, len(global_keys), max_data_rows_per_poll): + chunk = 
global_keys[i:i + max_data_rows_per_poll] + self._poll_data_row_processing_status( + [], chunk, start_time, wait_processing_max_seconds, + sleep_interval) + + def _poll_data_row_processing_status( + self, + data_row_ids: List[str], + global_keys: List[str], + start_time: datetime, + wait_processing_max_seconds: int = _wait_processing_max_seconds, + sleep_interval=30): + while True: if (datetime.now() - start_time).total_seconds() >= wait_processing_max_seconds: raise ProcessingWaitTimeout( - "Maximum wait time exceeded while waiting for data rows to be processed. Try creating a batch a bit later" - ) + """Maximum wait time exceeded while waiting for data rows to be processed. + Try creating a batch a bit later""") all_good = self.__check_data_rows_have_been_processed( data_row_ids, global_keys) From c4bdab4b005a9f5ecd48c956ff9884e5a8577121 Mon Sep 17 00:00:00 2001 From: Klaus Opreschko Date: Thu, 24 Aug 2023 10:14:34 -0600 Subject: [PATCH 20/20] prep for release --- CHANGELOG.md | 4 ++++ docs/source/conf.py | 2 +- labelbox/__init__.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49969ccc9..c28fb8f1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,8 @@ # Changelog +# Version 3.52.0 (2023-08-24) +## Added +* Added methods to create multiple batches for a project from a list of data rows +* Limit the number of data rows to be checked for processing status # Version 3.51.0 (2023-08-14) ## Added * Added global keys to export v2 filters for project, dataset and DataRow diff --git a/docs/source/conf.py b/docs/source/conf.py index af3413148..dd6ee3a1e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,7 +21,7 @@ copyright = '2021, Labelbox' author = 'Labelbox' -release = '3.51.0' +release = '3.52.0' # -- General configuration --------------------------------------------------- diff --git a/labelbox/__init__.py b/labelbox/__init__.py index 1dba57cf5..7a9efc5e6 100644 --- a/labelbox/__init__.py +++ b/labelbox/__init__.py @@ -1,5 +1,5 @@ name = "labelbox" -__version__ = "3.51.0" +__version__ = "3.52.0" from labelbox.client import Client from labelbox.schema.project import Project
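
For reference, the "Custom Data Row IDs Strategy" described in the prediction_id_mapping docstring added by patch 15 mentions a test-local fixture named "hardcoded_datarow_id" but never shows one. A hypothetical sketch of such a fixture is below; the placeholder id value and the callable shape (mirroring the existing get_data_row_id(indx=0) helper visible in the conftest hunks above) are assumptions for illustration, not code from these patches:

import pytest


@pytest.fixture
def hardcoded_datarow_id():
    # Hypothetical, test-local fixture: yields a callable that returns a known,
    # pre-existing data row id so the test does not create any new data rows.
    data_row_id = "cl_existing_data_row_id"  # placeholder value, not a real id

    def get_data_row_id(indx=0):
        return data_row_id

    yield get_data_row_id

A test opting into this strategy would presumably request hardcoded_datarow_id alongside prediction_id_mapping instead of relying on configured_project to create one data row per ontology member.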
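
The hunk for patch 19 is split mid-statement above (at "chunk = "), so the chunked wait it adds to Project._wait_until_data_rows_are_processed is restated here as a standalone sketch. Only the 100,000-id chunking and the per-chunk call to _poll_data_row_processing_status come from the diff; the free-function signature and the timeout default are assumptions:

from datetime import datetime
from typing import List, Optional


def wait_until_data_rows_are_processed(
        project,
        data_row_ids: Optional[List[str]] = None,
        global_keys: Optional[List[str]] = None,
        wait_processing_max_seconds: int = 3600,  # default is an assumption
        sleep_interval: int = 30) -> None:
    """Sketch of the chunked wait from patch 19, not the SDK's actual method."""
    start_time = datetime.now()
    max_data_rows_per_poll = 100_000  # check at most this many ids per polling pass

    if data_row_ids is not None:
        for i in range(0, len(data_row_ids), max_data_rows_per_poll):
            chunk = data_row_ids[i:i + max_data_rows_per_poll]
            project._poll_data_row_processing_status(
                chunk, [], start_time, wait_processing_max_seconds,
                sleep_interval)

    if global_keys is not None:
        for i in range(0, len(global_keys), max_data_rows_per_poll):
            chunk = global_keys[i:i + max_data_rows_per_poll]
            project._poll_data_row_processing_status(
                [], chunk, start_time, wait_processing_max_seconds,
                sleep_interval)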