
Commit
Merge pull request #268 from Labelbox/develop
3.2.1
msokoloff1 authored Sep 1, 2021
2 parents 55f4d7b + c9c66b7 commit c33fc04
Showing 7 changed files with 34 additions and 7 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,9 @@
# Changelog

# Version 3.2.1 (2021-08-31)
## Fix
* Resolved issue where `create_data_rows()` was not working on Amazon Linux (see the usage sketch below)

# Version 3.2.0 (2021-08-26)
## Added
* List `BulkImportRequest`s for a project with `Project.bulk_import_requests()`
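
For context on the fix noted above, a minimal hedged sketch of a typical `create_data_rows()` call; the API key, dataset name, and URL are placeholders, not values from this commit:

```python
from labelbox import Client

client = Client(api_key="YOUR_API_KEY")                  # placeholder key
dataset = client.create_dataset(name="example-dataset")  # placeholder name

# create_data_rows() uploads asynchronously and returns a Task that can
# be awaited before reading results.
task = dataset.create_data_rows([
    {"row_data": "https://example.com/image1.jpg", "external_id": "image1.jpg"},
])
task.wait_till_done()
```
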
1 change: 1 addition & 0 deletions examples/integrations/databricks/api_key_db_template.ipynb
@@ -0,0 +1 @@
{"cells":[{"cell_type":"code","source":["#We recommend using Databricks Secrets API to create a variable for your Labelbox API Key, but if you do not have access to the Secrets API you can use this notebook template to store your API key in a separate notebook. Be sure to include in gitignore to avoid committing your API key to Git. \n\napi_key = \"Paste your Labelbox API key here\"\n\ndbutils.notebook.exit(api_key) #returns api_key if you call this notebook via a notebook workflow\n\n###example code for notebook workflow w/ dbutils will get api_key from notebook \"api_key\"\n# try: API_KEY\n# except NameError: \n# API_KEY = dbutils.notebook.run(\"api_key\", 60) \n\n\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"5eed077b-aca5-4577-a500-7691c2db0d3a"}},"outputs":[],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"api_key_db_template","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":1214062433632427}},"nbformat":4,"nbformat_minor":0}
1 change: 1 addition & 0 deletions examples/integrations/databricks/labelbox_databricks_example.ipynb
@@ -0,0 +1 @@
{"cells":[{"cell_type":"markdown","source":["##Notebook Setup##"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"bd1df061-02be-4232-8016-4aeb27fd7691"}}},{"cell_type":"code","source":["from labelbox import Client\nimport databricks.koalas as pd\nimport labelspark\n\ntry: API_KEY\nexcept NameError: \n API_KEY = dbutils.notebook.run(\"api_key\", 60)\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Project Setup","showTitle":false,"inputWidgets":{},"nuid":"acdfe8c6-1d76-46af-a945-162f5c8a1e26"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["client = Client(API_KEY)\n\nprojects = client.get_projects()\nfor project in projects:\n print(project.name, project.uid)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Check Successful API Connection w/ Labelbox SDK ","showTitle":true,"inputWidgets":{},"nuid":"f6425a49-35f1-4476-9180-3a1e5894e9f9"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["# can parse the directory and make a Spark table of image URLs\n\ndef create_unstructured_dataset(): \n print(\"Creating table of unstructured image data\")\n # Pull information from Data Lake or other storage \n dataSet = client.get_dataset(\"ckolyi9ha7h800y7i5ppr3put\") #Insert Dataset ID from Labelbox for a sample dataset \n\n #creates a list of datarow dictionaries \n df_list = [ {\n \"external_id\": dataRow.external_id,\n \"row_data\": dataRow.row_data\n } for dataRow in dataSet.data_rows()]\n\n # Create DataFrame \n images = pd.DataFrame(df_list)\n df_images = images.to_spark()\n# display(df_images)\n df_images.registerTempTable(\"unstructured_data\")\n # df_images = spark.createDataFrame(images) \n\ntable_exists = False \ntblList = spark.catalog.listTables()\nif len(tblList) == 0: \n create_unstructured_dataset()\n table_exists = True\n\nfor table in tblList: \n if table.name == \"unstructured_data\": \n print(\"Unstructured data table exists\")\n table_exists = True\n\nif not table_exists: create_unstructured_dataset()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Demo-Prep: Load demo table of images and URLs","showTitle":true,"inputWidgets":{},"nuid":"bfaaf86c-3497-4096-a04e-95fa1bb3a576"}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["##Load Unstructured Data##"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"9a6da27e-62c9-48cd-a8a5-738ba8416df3"}}},{"cell_type":"code","source":["%sql \n\nselect * from unstructured_data"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"7a27962a-08ee-433c-95d0-64457269c137"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["from labelbox import Client\nclient = Client(API_KEY)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Create Labelbox Client ","showTitle":true,"inputWidgets":{},"nuid":"59d07ff3-e06b-4848-b100-41daa6c80086"}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["LabelSpark expects a spark table with two columns; the first column \"external_id\" and second column \"row_data\"\n\nexternal_id is a filename, like \"birds.jpg\" or \"my_video.mp4\"\n\nrow_data is the URL path to the file. Labelbox renders assets locally on your users' machines when they label, so your labeler will need permission to access that asset. 
\n\nExample: \n\n| external_id | row_data |\n|-------------|--------------------------------------|\n| image1.jpg | https://url_to_your_asset/image1.jpg |\n| image2.jpg | https://url_to_your_asset/image2.jpg |\n| image3.jpg | https://url_to_your_asset/image3.jpg |"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b186a2ad-0640-435e-88e8-a94b7439b3c3"}}},{"cell_type":"code","source":["import labelspark\nunstructured_data = spark.table(\"unstructured_data\")\ndataSet_new = labelspark.create_dataset(client, unstructured_data, \"Demo Dataset\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Create Dataset with Labelbox for Annotation","showTitle":true,"inputWidgets":{},"nuid":"e711f5c6-c99f-449d-83ef-4de53b5c11d4"}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["You can use the labelbox SDK to build your ontology. An example is provided below. \n\nPlease refer to documentation at https://docs.labelbox.com/python-sdk/en/index-en"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"d15d1331-c969-4b8f-bc8c-04c0756fb2d2"}}},{"cell_type":"code","source":["from labelbox.schema.ontology import OntologyBuilder, Tool, Classification, Option\n# from labelbox import Client\n# import os\n\nontology = OntologyBuilder()\ntool_people = Tool(tool=Tool.Type.BBOX, name=\"People\")\ntool_car = Tool(tool=Tool.Type.SEGMENTATION, name=\"Car\")\ntool_umbrella = Tool(tool=Tool.Type.POLYGON, name=\"Umbrella\")\nWeather_Classification = Classification(class_type=Classification.Type.RADIO, instructions=\"Weather\", \n options=[Option(value=\"Clear\"), \n Option(value=\"Overcast\"),\n Option(value=\"Rain\"),\n Option(value=\"Other\")])\nTime_of_Day = Classification(class_type=Classification.Type.RADIO, instructions=\"Time of Day\", \n options=[Option(value=\"Day\"),\n Option(value=\"Night\"),\n Option(value=\"Unknown\")])\n\nontology.add_tool(tool_people)\nontology.add_tool(tool_car)\nontology.add_tool(tool_umbrella)\nontology.add_classification(Weather_Classification)\nontology.add_classification(Time_of_Day)\n\n\nproject_demo2 = client.create_project(name=\"LabelSpark Demo Example\", description = \"Example description here.\")\nproject_demo2.datasets.connect(dataSet_new)\n\n# Setup frontends \nall_frontends = list(client.get_labeling_frontends())\nfor frontend in all_frontends:\n if frontend.name == 'Editor':\n project_frontend = frontend\n break\n\n# Attach Frontends\nproject_demo2.labeling_frontend.connect(project_frontend) \n# Attach Project and Ontology\nproject_demo2.setup(project_frontend, ontology.asdict()) \n\n\nprint(\"Project Setup is complete.\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Set Up Your Ontology with OntologyBuilder ","showTitle":true,"inputWidgets":{},"nuid":"a0eca58e-f98e-429e-95d9-74f67cf1464d"}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["##Bronze and Silver Annotation Tables##"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"19634126-4ccd-4372-a459-9db511785a22"}}},{"cell_type":"markdown","source":["Be sure to provide your Labelbox Project ID (a long string like \"ckolzeshr7zsy0736w0usbxdy\") to labelspark get_annotations method to pull in your labeled dataset. 
\n\n<br>bronze_table = labelspark.get_annotations(client,\"ckolzeshr7zsy0736w0usbxdy\", spark, sc) \n\n*These other methods transform the bronze table and do not require a project ID.* \n<br>flattened_bronze_table = labelspark.flatten_bronze_table(bronze_table)\n<br>silver_table = labelspark.bronze_to_silver(bronze_table)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"5539e072-46e7-457f-a523-a9ed98365918"}}},{"cell_type":"code","source":["client = Client(API_KEY) #refresh client \nbronze_table = labelspark.get_annotations(client,\"ckolzeshr7zsy0736w0usbxdj\", spark, sc) #insert your unique project ID here\nbronze_table.registerTempTable(\"street_photo_demo\")\ndisplay(bronze_table.limit(2))"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Query Labelbox for Raw Annotations (Bronze Table)","showTitle":true,"inputWidgets":{},"nuid":"09bab9cb-9271-4029-af20-df9bab287c72"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["client = Client(API_KEY) #refresh client \nbronze_table = spark.table(\"street_photo_demo\")\nflattened_bronze_table = labelspark.flatten_bronze_table(bronze_table)\ndisplay(flattened_bronze_table.limit(1))"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Bronze Table II (Labels Flattened ) ","showTitle":true,"inputWidgets":{},"nuid":"672275c2-5150-40e2-ba04-9ef4f0c3f0d4"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["client = Client(API_KEY) #refresh client \nsilver_table = labelspark.bronze_to_silver(bronze_table)\nsilver_table.registerTempTable(\"silver_table\")\ndisplay(silver_table)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Silver Table","showTitle":true,"inputWidgets":{},"nuid":"be18cfad-08b1-4067-8fbd-1ff44f55150c"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["%sql \n\nSELECT * FROM silver_table \nWHERE `People.count` > 0 \nAND `Umbrella.count` > 0\nAND `Car.count` > 0\nAND Weather = \"Rain\""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Sample Query","showTitle":true,"inputWidgets":{},"nuid":"3faa6015-5d93-4431-bad9-99e9d44d6111"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["%sql \n\nSELECT * FROM silver_table\nWHERE `People.count` > 10"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"50cc83b9-9f51-4467-a19c-672f0f302e74"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["def cleanup(): \n client = Client(API_KEY)\n dataSet_new.delete()\n project_demo2.delete()\n\ncleanup() "],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Demo Cleanup Code: Deleting Dataset and Projects","showTitle":true,"inputWidgets":{},"nuid":"2f42b1f3-8ed7-4e67-a539-6dffe6ece784"}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["### How To Get Video Project Annotations\n\nBecause Labelbox Video projects can contain multiple videos, you must use the `get_videoframe_annotations` method to return an array of DataFrames for each video in your project. 
Each DataFrame contains frame-by-frame annotation for a video in the project: \n\n```\nbronze_video = labelspark.get_annotations(client,\"labelbox_video_project_id_here\", spark, sc) \nvideo_dataframes = labelspark.get_videoframe_annotations(bronze_video, API_KEY, spark, sc) #note this extra step for video projects \n```\nYou may use standard LabelSpark methods iteratively to create your flattened bronze tables and silver tables: \n```\nflattened_bronze_video_dataframes = []\nsilver_video_dataframes = [] \nfor frameset in video_dataframes: \n flattened_bronze_video_dataframes.append(labelspark.flatten_bronze_table(frameset))\n silver_video_dataframes.append(labelspark.bronze_to_silver(frameset))\n```\nThis is how you would display the first video's frames and annotations, in sorted order: \n```\ndisplay(silver_video_dataframes[0]\n .join(bronze_video, [\"DataRow ID\"], \"inner\")\n .orderBy('frameNumber'), ascending = False)\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"Using the Connector to get Video Annotations","showTitle":true,"inputWidgets":{},"nuid":"bc419c6f-78b0-4a2d-8339-70431a72b582"}}},{"cell_type":"markdown","source":["While using LabelSpark, you will likely also use the Labelbox SDK (e.g. for programmatic ontology creation). These resources will help familiarize you with the Labelbox Python SDK: \n* [Visit our docs](https://labelbox.com/docs/python-api) to learn how the SDK works\n* Checkout our [notebook examples](https://github.com/Labelbox/labelspark/tree/master/notebooks) to follow along with interactive tutorials\n* view our [API reference](https://labelbox.com/docs/python-api/api-reference)."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"More Info","showTitle":true,"inputWidgets":{},"nuid":"8eaf3897-b0a8-44e9-aea4-fe58d5ee246f"}}}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"labelbox_databricks_example","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":1214062433632428}},"nbformat":4,"nbformat_minor":0}
21 changes: 21 additions & 0 deletions examples/integrations/databricks/readme.md
@@ -0,0 +1,21 @@
# Databricks + Labelbox

##### Use the Labelbox Connector to easily work with unstructured data in Databricks

--------


#### [Tutorial Notebook](labelbox_databricks_example.ipynb)
* Load DataFrame of unstructured data (URLs to video, images, or text)
* Create the dataset in Labelbox
* Annotate in Labelbox
* Load annotations into Databricks for easy querying and model training (see the sketch below)
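
A condensed, hedged sketch of that flow, drawn from the tutorial notebook: it assumes a Databricks session with `spark`, `sc`, and `display` in scope, an `API_KEY` variable, an existing `unstructured_data` table with `external_id` and `row_data` columns, and a placeholder project ID:

```python
from labelbox import Client
import labelspark

client = Client(API_KEY)

# 1. Load the Spark table of asset URLs (external_id, row_data).
unstructured_data = spark.table("unstructured_data")

# 2. Create a Labelbox dataset from that table for annotation.
dataset = labelspark.create_dataset(client, unstructured_data, "Demo Dataset")

# 3. After annotating in Labelbox, pull raw annotations back as a
#    "bronze" table and distill it into a queryable "silver" table.
bronze_table = labelspark.get_annotations(client, "your_project_id", spark, sc)
silver_table = labelspark.bronze_to_silver(bronze_table)
display(silver_table)
```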

#### [API Key Notebook](api_key_db_template.ipynb)
* This is a helper notebook for users without access to the Databricks Secrets API
* Allows you to store your Labelbox API key outside of your main notebook, for better security
* We recommend using the Secrets API whenever possible; the sketch below shows the fallback pattern this helper enables
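
A minimal sketch of that fallback pattern, taken from the tutorial notebook; `"api_key"` is the helper notebook's name and `60` is the run timeout in seconds:

```python
# Inside a Databricks notebook: reuse API_KEY if already defined in the
# session, otherwise fetch it from the separate "api_key" helper notebook
# via a notebook workflow, so the key never appears in this notebook.
try:
    API_KEY
except NameError:
    API_KEY = dbutils.notebook.run("api_key", 60)
```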

More information about the Connector is available on [PyPI](https://pypi.org/project/labelspark/).

[Connector Source Code](https://github.com/Labelbox/labelspark/)
2 changes: 1 addition & 1 deletion labelbox/__init__.py
@@ -1,5 +1,5 @@
name = "labelbox"
__version__ = "3.2.0"
__version__ = "3.2.1"

from labelbox.schema.project import Project
from labelbox.client import Client
4 changes: 2 additions & 2 deletions labelbox/schema/dataset.py
@@ -118,6 +118,7 @@ def create_data_rows(self, items):
"""
file_upload_thread_count = 20
DataRow = Entity.DataRow
AssetAttachment = Entity.AssetAttachment

def upload_if_necessary(item):
row_data = item['row_data']
@@ -135,8 +136,7 @@ def validate_attachments(item):
if attachments:
if isinstance(attachments, list):
for attachment in attachments:
Entity.AssetAttachment.validate_attachment_json(
attachment)
AssetAttachment.validate_attachment_json(attachment)
else:
raise ValueError(
f"Attachments must be a list. Found {type(attachments)}"
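
The hunk above hoists the `Entity.AssetAttachment` lookup out of the per-attachment loop, alongside the existing `DataRow = Entity.DataRow` binding. A minimal sketch of that pattern with illustrative names (`registry` stands in for the SDK's lazily resolved `Entity`; this is not the SDK's code):

```python
def validate_all_attachments(items, registry):
    # Resolve the lazily computed attribute once, up front, instead of
    # on every iteration of the inner loop (illustrative pattern only).
    AssetAttachment = registry.AssetAttachment
    for item in items:
        attachments = item.get("attachments")
        if attachments is None:
            continue
        if not isinstance(attachments, list):
            raise ValueError(
                f"Attachments must be a list. Found {type(attachments)}")
        for attachment in attachments:
            AssetAttachment.validate_attachment_json(attachment)
```
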
8 changes: 4 additions & 4 deletions tests/integration/test_data_row_metadata.py
@@ -119,7 +119,7 @@ def test_bulk_partial_delete_datarow_metadata(datarow, mdo):
assert len(datarow.metadata["fields"]) == (n_fields + 4)


@pytest.mark.slow
@pytest.mark.skip
def test_large_bulk_delete_datarow_metadata(big_dataset, mdo):
metadata = []
n_fields_start = 0
@@ -140,7 +140,7 @@ def test_large_bulk_delete_datarow_metadata(big_dataset, mdo):
assert len(errors) == 0

deletes = []
for dr in big_dataset.export_data_rows():
for dr in big_dataset.data_rows():
deletes.append(
DeleteDataRowMetadata(
data_row_id=dr.uid,
@@ -151,8 +151,8 @@ def test_large_bulk_delete_datarow_metadata(big_dataset, mdo):

errors = mdo.bulk_delete(deletes)
assert len(errors) == 0
for dr in big_dataset.export_data_rows():
assert len(dr.metadata["fields"]) == 1 + n_fields_start
for dr in big_dataset.data_rows():
assert len(dr.metadata["fields"]) == n_fields_start
break


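The test change above swaps `export_data_rows()` for `data_rows()` when iterating the dataset. Both methods appear on `Dataset` in this SDK line; a hedged sketch of the contrast, with placeholder credentials and IDs (the behavioral notes are assumptions, not taken from this commit):

```python
from labelbox import Client

client = Client(api_key="YOUR_API_KEY")          # placeholder key
dataset = client.get_dataset("your_dataset_id")  # placeholder ID

# data_rows(): paginated relationship query; reflects current state.
for dr in dataset.data_rows():
    print(dr.uid)
    break

# export_data_rows(): iterates rows from a dataset export, which may be
# served from a cached export file (assumption about SDK behavior).
for dr in dataset.export_data_rows():
    print(dr.uid)
    break
```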
