Skip to content

Commit 3b70de3

Browse files
authored
Merge pull request #179 from Labelbox/ms/export-dataset
bulk export datasets
2 parents ba951a0 + bf9d0a6 commit 3b70de3

File tree

5 files changed

+60
-6
lines changed

5 files changed

+60
-6
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Changelog
22

3+
# Next Release:
4+
* Added `dataset.export_data_rows()` which returns all `DataRows` for a `Dataset`.
5+
36
# Version 2.6.0 (2021-06-11)
47
## Fix
58
* Updated `create_mask_ndjson` helper function in `image_mal.ipynb` to use the color arg

labelbox/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name = "labelbox"
2-
__version__ = "2.6.0"
2+
__version__ = "2.7.0"
33

44
from labelbox.client import Client
55
from labelbox.schema.bulk_import_request import BulkImportRequest

labelbox/schema/dataset.py

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,12 @@
33
import logging
44
from itertools import islice
55
from multiprocessing.dummy import Pool as ThreadPool
6+
import time
7+
import ndjson
8+
from io import StringIO
9+
import requests
610

7-
from labelbox.exceptions import InvalidQueryError, ResourceNotFoundError, InvalidAttributeError
11+
from labelbox.exceptions import InvalidQueryError, LabelboxError, ResourceNotFoundError, InvalidAttributeError
812
from labelbox.orm.db_object import DbObject, Updateable, Deletable
913
from labelbox.orm.model import Entity, Field, Relationship
1014

@@ -75,15 +79,15 @@ def create_data_rows(self, items):
7579
is uploaded to Labelbox and a DataRow referencing it is created.
7680
7781
If an item is a `dict`, then it could support one of the two following structures
78-
1. For static imagery, video, and text it should map `DataRow` fields (or their names) to values.
82+
1. For static imagery, video, and text it should map `DataRow` fields (or their names) to values.
7983
At the minimum an `item` passed as a `dict` must contain a `DataRow.row_data` key and value.
8084
2. For tiled imagery the dict must match the import structure specified in the link below
8185
https://docs.labelbox.com/data-model/en/index-en#tiled-imagery-import
82-
86+
8387
>>> dataset.create_data_rows([
8488
>>> {DataRow.row_data:"http://my_site.com/photos/img_01.jpg"},
8589
>>> "path/to/file2.jpg",
86-
>>> {"tileLayerUrl" : "http://", ...}
90+
>>> {"tileLayerUrl" : "http://", ...}
8791
>>> ])
8892
8993
For an example showing how to upload tiled data_rows see the following notebook:
@@ -227,3 +231,41 @@ def data_row_for_external_id(self, external_id):
227231
f"More than one data_row has the provided external_id : `%s`. Use function data_rows_for_external_id to fetch all",
228232
external_id)
229233
return data_rows[0]
234+
235+
def export_data_rows(self, timeout_seconds=120):
    """ Returns a generator that produces all data rows that are currently attached to this dataset.

    Polls the `exportDatasetDataRows` mutation every two seconds until the
    reported status is `COMPLETE`, then downloads the NDJSON payload from the
    returned URL and wraps each record in a `DataRow`.

    Args:
        timeout_seconds (float): Max waiting time, in seconds.
    Returns:
        Generator that yields DataRow objects belonging to this dataset.
    Raises:
        LabelboxError: if the export fails or is unable to download within the specified time.
        requests.HTTPError: if downloading the finished export returns an HTTP error status.
    """
    id_param = "datasetId"
    query_str = """mutation GetDatasetDataRowsExportUrlPyApi($%s: ID!)
        {exportDatasetDataRows(data:{datasetId: $%s }) {downloadUrl createdAt status}}
    """ % (id_param, id_param)
    sleep_time = 2
    # Track the remaining budget separately so the timeout error below can
    # report the limit the caller actually asked for, not the leftover
    # (zero or negative) value.
    remaining_seconds = timeout_seconds
    while True:
        res = self.client.execute(query_str, {id_param: self.uid})
        res = res["exportDatasetDataRows"]
        if res["status"] == "COMPLETE":
            download_url = res["downloadUrl"]
            response = requests.get(download_url)
            response.raise_for_status()
            reader = ndjson.reader(StringIO(response.text))
            return (
                Entity.DataRow(self.client, result) for result in reader)
        elif res["status"] == "FAILED":
            raise LabelboxError("Data row export failed.")

        # Any other status: charge one polling interval against the budget
        # before sleeping, and give up once it is exhausted.
        remaining_seconds -= sleep_time
        if remaining_seconds <= 0:
            raise LabelboxError(
                f"Unable to export data rows within {timeout_seconds} seconds."
            )

        logger.debug("Dataset '%s' data row export, waiting for server...",
                     self.uid)
        time.sleep(sleep_time)

tests/integration/test_dataset.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,13 @@ def test_upload_video_file(dataset, sample_video: str) -> None:
9797
response = requests.head(url, allow_redirects=True)
9898
assert int(response.headers['Content-Length']) == content_length
9999
assert response.headers['Content-Type'] == 'video/mp4'
100+
101+
102+
def test_data_row_export(dataset):
    """Exporting a dataset yields exactly the data rows that were created in it."""
    n_data_rows = 5
    # Create the rows up front and remember them for comparison with the export.
    ids = {
        dataset.create_data_row(row_data=IMG_URL) for _ in range(n_data_rows)
    }
    result = list(dataset.export_data_rows())
    assert len(result) == n_data_rows
    assert set(result) == ids

tests/integration/test_relationships.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from labelbox.exceptions import InvalidQueryError
44

55

6-
@pytest.mark.skip("api bug is causing this to break. This is being addressed")
76
def test_project_dataset(client, rand_gen):
87
project = client.create_project(name=rand_gen(str))
98
dataset = client.create_dataset(name=rand_gen(str))

0 commit comments

Comments
 (0)