Skip to content

Commit

Permalink
Merge branch 'master' into fix-openapi-schema-spec-gen
Browse files Browse the repository at this point in the history
  • Loading branch information
david-leifker authored Feb 5, 2025
2 parents 6e90440 + 23a86fd commit b53016e
Show file tree
Hide file tree
Showing 20 changed files with 4,916 additions and 54 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/docker-unified.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1274,6 +1274,11 @@ jobs:
**/build/test-results/test/**
**/junit.*.xml
!**/binary/**
- name: Upload test results to Codecov
if: ${{ !cancelled() }}
uses: codecov/test-results-action@v1
with:
token: ${{ secrets.CODECOV_TOKEN }}
deploy_datahub_head:
name: Deploy to Datahub HEAD
runs-on: ubuntu-latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,20 @@ export const LinkList = ({ refetch }: LinkListProps) => {
}
};

// Ask the user to confirm before a link is actually removed; deletion is
// delegated to handleDeleteLink only when the user accepts the dialog.
const onConfirmDelete = (link) => {
    const confirmConfig = {
        title: `Delete Link '${link?.description}'`,
        content: `Are you sure you want to remove this Link?`,
        okText: 'Yes',
        // Allow dismissing via mask click or the close icon as well as Cancel.
        maskClosable: true,
        closable: true,
        onOk: () => {
            handleDeleteLink(link);
        },
        onCancel: () => {},
    };
    Modal.confirm(confirmConfig);
};

return entityData ? (
<>
<Modal
Expand Down Expand Up @@ -162,7 +176,7 @@ export const LinkList = ({ refetch }: LinkListProps) => {
<Button onClick={() => handleEditLink(link)} type="text" shape="circle">
<EditOutlined />
</Button>
<Button onClick={() => handleDeleteLink(link)} type="text" shape="circle" danger>
<Button onClick={() => onConfirmDelete(link)} type="text" shape="circle" danger>
<DeleteOutlined />
</Button>
</>
Expand Down
23 changes: 23 additions & 0 deletions docs-website/adoptionStoriesIndexes.json
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,17 @@
"imageUrl": "/img/logos/companies/snap.png",
"imageSize": "small"
},
{
"name": "Slack",
"slug": "slack",
"imageUrl": "/img/logos/companies/slack.png",
"imageSize": "large",
"link": "https://youtu.be/G5B0W03dvuU",
"linkType": "video",
"tagline": "Why Slack chose DataHub to solve lineage and discovery.",
"category": "B2B & B2C",
"description": "Slack adopted DataHub to track their entire data landscape, build lineage, and add rich context to metadata for search & discovery."
},
{
"name": "Airtel",
"slug": "airtel",
Expand Down Expand Up @@ -161,6 +172,18 @@
"category": "Financial & Fintech",
"description": "Discover how Checkout leverage DataHub for advanced data management and compliance, especially in managing sensitive data types."
},
{
"name": "Etsy",
"slug": "etsy",
"imageUrl": "/img/logos/companies/etsy.png",
"imageSize": "medium",
"link": "https://youtu.be/kLe_xfTR_rM",
"linkType": "video",
"tagline": "Why Etsy used DataHub to solve their Data Discovery needs.",
"category": "E-Commerce",
"platform": "cloud",
"description": "Etsy leverages DataHub to solve their data discovery needs, enabling their data teams to find, understand, and trust their data."
},
{
"name": "MYOB",
"slug": "myob",
Expand Down
2 changes: 1 addition & 1 deletion docs-website/docusaurus.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ module.exports = {
announcementBar: {
id: "announcement-3",
content:
'<div style="display: flex; justify-content: center; align-items: center;width: 100%;"><!--img src="/img/acryl-logo-white-mark.svg" / --><!--div style="font-size: .8rem; font-weight: 600; background-color: white; color: #111; padding: 0px 8px; border-radius: 4px; margin-right:12px;">NEW</div--><p>Learn about DataHub 1.0 launching at our 5th birthday party!</p><a href="https://lu.ma/0j5jcocn" target="_blank" class="button">Register<span> →</span></a></div>',
'<div style="display: flex; justify-content: center; align-items: center;width: 100%;"><!--img src="/img/acryl-logo-white-mark.svg" / --><!--div style="font-size: .8rem; font-weight: 600; background-color: white; color: #111; padding: 0px 8px; border-radius: 4px; margin-right:12px;">NEW</div--><p>Learn about DataHub 1.0</p><a href="https://youtu.be/B3IA6cLaKEk" target="_blank" class="button">Watch Now<span> →</span></a></div>',
backgroundColor: "#111",
textColor: "#ffffff",
isCloseable: false,
Expand Down
20 changes: 15 additions & 5 deletions docs-website/src/pages/_components/CaseStudy/caseStudyContent.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,14 @@ const caseStudyData = [
link: "https://www.youtube.com/watch?v=YoxTg8tQSwg",
},
{
title: "Reliable Data Products",
title: "Powering Discovery in Slack's data ecosystem",
description:
"How Miro leverages DataHub Cloud to deliver reliable data products.",
"Learn why DataHub was the obvious choice for Slack to solve their lineage and discovery needs.",
tag: "Technology",
backgroundImage:
"https://miro.com/blog/wp-content/uploads/2024/08/header-diagramming-s4-02.png",
image: "/img/logos/companies/miro.png",
link: "https://miro.com/careers/life-at-miro/tech/data-products-reliability-the-power-of-metadata/",
"https://i.pcmag.com/imagery/reviews/07td46ju7p6lLVb0QGwc5VF-19.fit_lim.size_1050x.png",
image: "/img/logos/companies/slack.png",
link: "https://datahubproject.io/adoption-stories/#slack",
},
{
title: "Working with Petabyte Scale Healthcare Data",
Expand Down Expand Up @@ -89,6 +89,16 @@ const caseStudyData = [
image: "/img/logos/companies/zynga.png",
link: "https://datahubproject.io/adoption-stories/#zynga",
},
{
title: "Reliable Data Products",
description:
"How Miro leverages DataHub Cloud to deliver reliable data products.",
tag: "Technology",
backgroundImage:
"https://miro.com/blog/wp-content/uploads/2024/08/header-diagramming-s4-02.png",
image: "/img/logos/companies/miro.png",
link: "https://miro.com/careers/life-at-miro/tech/data-products-reliability-the-power-of-metadata/",
},
{
title: "And many more...",
description:
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs-website/static/img/logos/companies/etsy.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs-website/static/img/logos/companies/slack.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 2 additions & 1 deletion docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -736,7 +736,7 @@ Please see our [Integrations page](https://datahubproject.io/integrations) if yo
| [datahub-lineage-file](./generated/ingestion/sources/file-based-lineage.md) | _no additional dependencies_ | Lineage File source |
| [datahub-business-glossary](./generated/ingestion/sources/business-glossary.md) | _no additional dependencies_ | Business Glossary File source |
| [dbt](./generated/ingestion/sources/dbt.md) | _no additional dependencies_ | dbt source |
| [dremio](./generated/ingestion/sources/dremio.md) | `pip install 'acryl-datahub[dremio]'` | Dremio Source |
| [dremio](./generated/ingestion/sources/dremio.md) | `pip install 'acryl-datahub[dremio]'` | Dremio Source |
| [druid](./generated/ingestion/sources/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source |
| [feast](./generated/ingestion/sources/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source (0.26.0) |
| [glue](./generated/ingestion/sources/glue.md) | `pip install 'acryl-datahub[glue]'` | AWS Glue source |
Expand All @@ -759,6 +759,7 @@ Please see our [Integrations page](https://datahubproject.io/integrations) if yo
| [redash](./generated/ingestion/sources/redash.md) | `pip install 'acryl-datahub[redash]'` | Redash source |
| [redshift](./generated/ingestion/sources/redshift.md) | `pip install 'acryl-datahub[redshift]'` | Redshift source |
| [sagemaker](./generated/ingestion/sources/sagemaker.md) | `pip install 'acryl-datahub[sagemaker]'` | AWS SageMaker source |
| [salesforce](./generated/ingestion/sources/salesforce.md) | `pip install 'acryl-datahub[salesforce]'` | Salesforce source |
| [snowflake](./generated/ingestion/sources/snowflake.md) | `pip install 'acryl-datahub[snowflake]'` | Snowflake source |
| [sqlalchemy](./generated/ingestion/sources/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
| [superset](./generated/ingestion/sources/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source |
Expand Down
33 changes: 17 additions & 16 deletions metadata-ingestion/src/datahub/ingestion/source/mongodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,26 +219,27 @@ def construct_schema_pymongo(
"""

aggregations: List[Dict] = []

# The order of the aggregations impacts execution time. By setting the sample/limit aggregation first,
# the subsequent aggregations process a much smaller dataset, improving performance.
if sample_size:
if use_random_sampling:
aggregations.append({"$sample": {"size": sample_size}})
else:
aggregations.append({"$limit": sample_size})

if should_add_document_size_filter:
doc_size_field = "temporary_doc_size_field"
# create a temporary field to store the size of the document. filter on it and then remove it.
aggregations = [
{"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
{"$match": {doc_size_field: {"$lt": max_document_size}}},
{"$project": {doc_size_field: 0}},
]
if use_random_sampling:
# get sample documents in collection
if sample_size:
aggregations.append({"$sample": {"size": sample_size}})
documents = collection.aggregate(
aggregations,
allowDiskUse=True,
aggregations.extend(
[
{"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
{"$match": {doc_size_field: {"$lt": max_document_size}}},
{"$project": {doc_size_field: 0}},
]
)
else:
if sample_size:
aggregations.append({"$limit": sample_size})
documents = collection.aggregate(aggregations, allowDiskUse=True)

documents = collection.aggregate(aggregations, allowDiskUse=True)

return construct_schema(list(documents), delimiter)

Expand Down
19 changes: 14 additions & 5 deletions metadata-ingestion/src/datahub/ingestion/source/s3/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -866,8 +866,21 @@ def get_folder_info(
Returns:
List[Folder]: A list of Folder objects representing the partitions found.
"""

def _is_allowed_path(path_spec_: PathSpec, s3_uri: str) -> bool:
    """Return True when ``s3_uri`` passes the path-spec filter.

    Side effect: a URI that does not match is logged at debug level and
    counted in the ingestion report as a dropped file, so callers only
    need to use the boolean result for filtering.
    """
    allowed = path_spec_.allowed(s3_uri)
    if not allowed:
        logger.debug(f"File {s3_uri} not allowed and skipping")
        # NOTE(review): 'self' is captured from the enclosing method's
        # scope; this helper is only valid inside get_folder_info.
        self.report.report_file_dropped(s3_uri)
    return allowed

s3_objects = (
obj
for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
if _is_allowed_path(path_spec, f"s3://{obj.bucket_name}/{obj.key}")
)

partitions: List[Folder] = []
s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
grouped_s3_objects_by_dirname = groupby_unsorted(
s3_objects,
key=lambda obj: obj.key.rsplit("/", 1)[0],
Expand All @@ -878,10 +891,6 @@ def get_folder_info(
modification_time = None

for item in group:
file_path = self.create_s3_path(item.bucket_name, item.key)
if not path_spec.allowed(file_path):
logger.debug(f"File {file_path} not allowed and skipping")
continue
file_size += item.size
if creation_time is None or item.last_modified < creation_time:
creation_time = item.last_modified
Expand Down
Loading

0 comments on commit b53016e

Please sign in to comment.