@@ -3,8 +3,12 @@
 import logging
 from itertools import islice
 from multiprocessing.dummy import Pool as ThreadPool
+import time
+import ndjson
+from io import StringIO
+import requests
 
-from labelbox.exceptions import InvalidQueryError, ResourceNotFoundError, InvalidAttributeError
+from labelbox.exceptions import InvalidQueryError, LabelboxError, ResourceNotFoundError, InvalidAttributeError
 from labelbox.orm.db_object import DbObject, Updateable, Deletable
 from labelbox.orm.model import Entity, Field, Relationship
 
@@ -75,15 +79,15 @@ def create_data_rows(self, items):
         is uploaded to Labelbox and a DataRow referencing it is created.
 
         If an item is a `dict`, then it could support one of the two following structures
-            1. For static imagery, video, and text it should map `DataRow` fields (or their names) to values.
+            1. For static imagery, video, and text it should map `DataRow` fields (or their names) to values.
                At the minimum an `item` passed as a `dict` must contain a `DataRow.row_data` key and value.
             2. For tiled imagery the dict must match the import structure specified in the link below
                https://docs.labelbox.com/data-model/en/index-en#tiled-imagery-import
-
+
         >>> dataset.create_data_rows([
         >>>     {DataRow.row_data:"http://my_site.com/photos/img_01.jpg"},
         >>>     "path/to/file2.jpg",
-        >>>     {"tileLayerUrl" : "http://", ...}
+        >>>     {"tileLayerUrl" : "http://", ...}
         >>> ])
 
         For an example showing how to upload tiled data_rows see the following notebook:
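As a quick orientation for reviewers, here is a usage sketch of `create_data_rows` with the `dict` form described above. It assumes an authenticated `Client` (the API key read from the environment), and the dataset name and image URLs are illustrative; `create_data_rows` returns a `Task`, so `wait_till_done` blocks until the asynchronous upload finishes.

>>> from labelbox import Client, DataRow
>>> client = Client()  # picks up LABELBOX_API_KEY from the environment
>>> dataset = client.create_dataset(name="example-dataset")  # illustrative name
>>> task = dataset.create_data_rows([
>>>     {DataRow.row_data: "http://my_site.com/photos/img_01.jpg"},
>>>     {"row_data": "http://my_site.com/photos/img_02.jpg", "external_id": "img_02"},
>>> ])
>>> task.wait_till_done()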
@@ -227,3 +231,41 @@ def data_row_for_external_id(self, external_id):
                 f"More than one data_row has the provided external_id : `%s`. Use function data_rows_for_external_id to fetch all",
                 external_id)
         return data_rows[0]
+
+    def export_data_rows(self, timeout_seconds=120):
+        """ Returns a generator that produces all data rows that are currently attached to this dataset.
+
+        Args:
+            timeout_seconds (float): Max waiting time, in seconds.
+        Returns:
+            Generator that yields DataRow objects belonging to this dataset.
+        Raises:
+            LabelboxError: if the export fails or is unable to download within the specified time.
+        """
+        id_param = "datasetId"
+        query_str = """mutation GetDatasetDataRowsExportUrlPyApi($%s: ID!)
+            {exportDatasetDataRows(data:{datasetId: $%s }) {downloadUrl createdAt status}}
+            """ % (id_param, id_param)
+        sleep_time = 2
+        while True:
+            res = self.client.execute(query_str, {id_param: self.uid})
+            res = res["exportDatasetDataRows"]
+            if res["status"] == "COMPLETE":
+                download_url = res["downloadUrl"]
+                response = requests.get(download_url)
+                response.raise_for_status()
+                reader = ndjson.reader(StringIO(response.text))
+                return (
+                    Entity.DataRow(self.client, result) for result in reader)
+            elif res["status"] == "FAILED":
+                raise LabelboxError("Data row export failed.")
+
+            timeout_seconds -= sleep_time
+            if timeout_seconds <= 0:
+                raise LabelboxError(
+                    "Timed out waiting for the data row export to complete."
+                )
+
+            logger.debug("Dataset '%s' data row export, waiting for server...",
+                         self.uid)
+            time.sleep(sleep_time)
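And a usage sketch for the new export path (the dataset id is a placeholder). Note that the generator is only produced after the export file has been downloaded in full, so `timeout_seconds` bounds the polling loop, not iteration:

>>> dataset = client.get_dataset("<dataset_uid>")
>>> for data_row in dataset.export_data_rows(timeout_seconds=300):
...     print(data_row.uid, data_row.external_id)

The server runs the export asynchronously, so the method polls every `sleep_time` seconds until the job reports COMPLETE or FAILED, then streams the NDJSON download into `DataRow` objects.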