Tooling to support rockset migration (#5366)
Adding tooling to analyze Rockset query lambdas and collections. This is
meant to be one-off, throwaway code, just for use during the early days
of the migration.

It includes:
- Code to delete queries that are unused (currently about 80 of 180), in
batches of 10
- Backups of all the queries in case we need to revert a delete

Note: The main file of interest in this PR is rockset_queries.py, which
is the Python script generated from the notebook rockset_queries.ipynb.
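For context on what the notebook automates, here is a minimal, hypothetical sketch of the two operations above (back everything up, then delete unused lambdas in batches) written against the Rockset REST API with `requests`. The API host, endpoint paths, response shape, and the caller-supplied set of unused lambda names are all assumptions for illustration; this is not the code in rockset_queries.py.

```python
import json
import os
import time
from pathlib import Path

import requests

# Assumed Rockset API host and auth header format; both vary by org/region.
ROCKSET_API = "https://api.usw2a1.rockset.com"
HEADERS = {"Authorization": f"ApiKey {os.environ['ROCKSET_API_KEY']}"}

BACKUP_DIR = Path("query_lambda_backups")


def list_query_lambdas(workspace: str) -> list:
    # Assumed endpoint: list every query lambda in a workspace.
    resp = requests.get(
        f"{ROCKSET_API}/v1/orgs/self/ws/{workspace}/lambdas", headers=HEADERS
    )
    resp.raise_for_status()
    return resp.json()["data"]  # assumed response shape: {"data": [...]}


def backup_query_lambda(workspace: str, lam: dict) -> None:
    # Save the full lambda definition to disk so a delete can be reverted later.
    BACKUP_DIR.mkdir(exist_ok=True)
    out = BACKUP_DIR / f"{workspace}.{lam['name']}.json"
    out.write_text(json.dumps(lam, indent=2))


def delete_query_lambda(workspace: str, name: str) -> None:
    # Assumed endpoint: delete a single query lambda by name.
    resp = requests.delete(
        f"{ROCKSET_API}/v1/orgs/self/ws/{workspace}/lambdas/{name}", headers=HEADERS
    )
    resp.raise_for_status()


def delete_unused_in_batches(workspace: str, unused: set, batch_size: int = 10) -> None:
    # Back everything up first, then delete the unused lambdas batch_size at a time.
    lambdas = list_query_lambdas(workspace)
    for lam in lambdas:
        backup_query_lambda(workspace, lam)

    to_delete = [lam["name"] for lam in lambdas if lam["name"] in unused]
    for i in range(0, len(to_delete), batch_size):
        batch = to_delete[i : i + batch_size]
        for name in batch:
            delete_query_lambda(workspace, name)
        print(f"deleted batch: {batch}")
        time.sleep(1)  # brief pause between batches
```

In this sketch, deciding which lambdas count as unused is left to the caller; the backups written by backup_query_lambda resemble the per-lambda JSON files added in this diff.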
ZainRizvi authored Jun 25, 2024
1 parent caec1fc commit cf2eccc
Showing 363 changed files with 13,209 additions and 0 deletions.
2 changes: 2 additions & 0 deletions tools/rockset_migration/README.md
@@ -0,0 +1,2 @@
This folder contains tools/scripts used to help with migrating away from Rockset

@@ -0,0 +1,62 @@
{
"workspace": "benchmarks",
"last_updated_by": "[email protected]",
"last_updated": "2024-06-15T23:03:05Z",
"name": "oss_ci_benchmark_branches",
"version_count": 3,
"collections": [
"commons.workflow_run",
"benchmarks.oss_ci_benchmark"
],
"latest_version": {
"workspace": "benchmarks",
"created_by": "[email protected]",
"created_by_apikey_name": null,
"created_at": "2024-06-15T23:03:05Z",
"name": "oss_ci_benchmark_branches",
"version": "76446d877defb748",
"description": "Query branches and commits from OSS CI benchmarks",
"sql": {
"query": "--- This query is used to get the list of branches and commits used by different\n--- OSS CI benchmark experiments. This powers HUD benchmarks dashboards\nSELECT\n DISTINCT w.head_branch,\n w.head_sha,\n w.id,\n FORMAT_ISO8601(\n DATE_TRUNC(: granularity, o._event_time)\n ) AS event_time,\n o.filename\nFROM\n benchmarks.oss_ci_benchmark o\n LEFT JOIN commons.workflow_run w ON o.workflow_id = w.id\nWHERE\n o._event_time >= PARSE_DATETIME_ISO8601(: startTime)\n AND o._event_time < PARSE_DATETIME_ISO8601(: stopTime)\n AND (\n ARRAY_CONTAINS(\n SPLIT(: filenames, ','),\n o.filename\n )\n OR : filenames = ''\n )\n AND o.metric IS NOT NULL\n AND w.html_url LIKE CONCAT('%', : repo, '%')\n AND o.dtype IS NOT NULL\n AND o.device IS NOT NULL\nORDER BY\n w.head_branch,\n event_time DESC",
"default_parameters": [
{
"name": "filenames",
"type": "string",
"value": ""
},
{
"name": "granularity",
"type": "string",
"value": "day"
},
{
"name": "repo",
"type": "string",
"value": "pytorch/pytorch"
},
{
"name": "startTime",
"type": "string",
"value": "2024-05-01T00:00:00.00Z"
},
{
"name": "stopTime",
"type": "string",
"value": "2024-08-01T00:00:00.00Z"
}
]
},
"collections": [
"commons.workflow_run",
"benchmarks.oss_ci_benchmark"
],
"state": "ACTIVE",
"stats": {
"last_executed": "2024-06-25T07:35:30Z",
"last_executed_by": "[email protected]",
"last_execution_error": null,
"last_execution_error_message": null
},
"public_access_id": null
}
}
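A backup like the one above keeps the latest version's SQL and default parameters, which should be enough to recreate the lambda if a delete has to be reverted. A hedged restore sketch, assuming the same REST API conventions as above and a create endpoint at POST /v1/orgs/self/ws/{workspace}/lambdas (not part of this PR):

```python
import json
import os
from pathlib import Path

import requests

ROCKSET_API = "https://api.usw2a1.rockset.com"  # assumed API host
HEADERS = {"Authorization": f"ApiKey {os.environ['ROCKSET_API_KEY']}"}


def restore_query_lambda(backup_path: str) -> None:
    # Recreate a deleted query lambda from a backup JSON like the one shown above.
    backup = json.loads(Path(backup_path).read_text())
    latest = backup["latest_version"]
    payload = {
        "name": latest["name"],
        "description": latest["description"],
        "sql": latest["sql"],  # holds "query" and "default_parameters"
    }
    # Assumed create endpoint; restores the lambda as a fresh version.
    resp = requests.post(
        f"{ROCKSET_API}/v1/orgs/self/ws/{backup['workspace']}/lambdas",
        headers=HEADERS,
        json=payload,
    )
    resp.raise_for_status()
```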
@@ -0,0 +1,30 @@
{
"query": "--- This query is used to get the list of branches and commits used by different\n--- OSS CI benchmark experiments. This powers HUD benchmarks dashboards\nSELECT\n DISTINCT w.head_branch,\n w.head_sha,\n w.id,\n FORMAT_ISO8601(\n DATE_TRUNC(: granularity, o._event_time)\n ) AS event_time,\n o.filename\nFROM\n benchmarks.oss_ci_benchmark o\n LEFT JOIN commons.workflow_run w ON o.workflow_id = w.id\nWHERE\n o._event_time >= PARSE_DATETIME_ISO8601(: startTime)\n AND o._event_time < PARSE_DATETIME_ISO8601(: stopTime)\n AND (\n ARRAY_CONTAINS(\n SPLIT(: filenames, ','),\n o.filename\n )\n OR : filenames = ''\n )\n AND o.metric IS NOT NULL\n AND w.html_url LIKE CONCAT('%', : repo, '%')\n AND o.dtype IS NOT NULL\n AND o.device IS NOT NULL\nORDER BY\n w.head_branch,\n event_time DESC",
"default_parameters": [
{
"name": "filenames",
"type": "string",
"value": ""
},
{
"name": "granularity",
"type": "string",
"value": "day"
},
{
"name": "repo",
"type": "string",
"value": "pytorch/pytorch"
},
{
"name": "startTime",
"type": "string",
"value": "2024-05-01T00:00:00.00Z"
},
{
"name": "stopTime",
"type": "string",
"value": "2024-08-01T00:00:00.00Z"
}
]
}
@@ -0,0 +1,92 @@
{
"workspace": "benchmarks",
"last_updated_by": "[email protected]",
"last_updated": "2024-06-19T19:40:01Z",
"name": "oss_ci_benchmark_llms",
"version_count": 6,
"collections": [
"commons.workflow_run",
"benchmarks.oss_ci_benchmark"
],
"latest_version": {
"workspace": "benchmarks",
"created_by": "[email protected]",
"created_by_apikey_name": null,
"created_at": "2024-06-19T19:40:01Z",
"name": "oss_ci_benchmark_llms",
"version": "656fe095f7e9a3ab",
"description": "The query to power LLMs benchmark dashboard",
"sql": {
"query": "--- This query is used to get the LLMs benchmark results from different experiments. It\n--- queries the TPS and memory bandwidth for each model / quantization combos. This powers\n--- the LLMs benchmark dashboard\nSELECT\n DISTINCT o.workflow_id,\n -- As the JSON response is pretty big, only return the field if it's needed\n IF(:getJobId, o.job_id, NULL) AS job_id,\n o.name,\n o.metric,\n IF(\n o.actual IS NOT NULL,\n CAST(o.actual AS FLOAT), 0.0\n ) AS actual,\n IF(\n o.target IS NOT NULL,\n CAST(o.target AS FLOAT), 0.0\n ) AS target,\n FORMAT_ISO8601(\n DATE_TRUNC(: granularity, w._event_time)\n ) AS granularity_bucket,\n o.dtype,\n o.device,\nFROM\n benchmarks.oss_ci_benchmark o\n LEFT JOIN commons.workflow_run w ON o.workflow_id = w.id\nWHERE\n (\n ARRAY_CONTAINS(\n SPLIT(: branches, ','),\n w.head_branch\n )\n OR : branches = ''\n )\n AND (\n ARRAY_CONTAINS(\n SPLIT(: commits, ','),\n w.head_sha\n )\n OR : commits = ''\n )\n AND (\n ARRAY_CONTAINS(\n SPLIT(: filenames, ','),\n o.filename\n )\n OR : filenames = ''\n )\n AND (\n ARRAY_CONTAINS(\n SPLIT(: names, ','),\n o.name\n )\n OR : names = ''\n )\n AND (\n ARRAY_CONTAINS(\n SPLIT(: devices, ','),\n o.device\n )\n OR : devices = ''\n )\n AND (\n ARRAY_CONTAINS(\n SPLIT(: dtypes, ','),\n o.dtype\n )\n OR : dtypes = ''\n )\n AND o.metric IS NOT NULL\n AND o.dtype IS NOT NULL\n AND o.device IS NOT NULL\n AND w.html_url LIKE CONCAT('%', : repo, '%')\nORDER BY\n granularity_bucket DESC,\n workflow_id DESC,\n name,\n dtype,\n device",
"default_parameters": [
{
"name": "branches",
"type": "string",
"value": "main"
},
{
"name": "commits",
"type": "string",
"value": ""
},
{
"name": "devices",
"type": "string",
"value": ""
},
{
"name": "dtypes",
"type": "string",
"value": ""
},
{
"name": "filenames",
"type": "string",
"value": ""
},
{
"name": "getJobId",
"type": "bool",
"value": "false"
},
{
"name": "granularity",
"type": "string",
"value": "day"
},
{
"name": "names",
"type": "string",
"value": ""
},
{
"name": "repo",
"type": "string",
"value": "pytorch/pytorch"
},
{
"name": "startTime",
"type": "string",
"value": "2024-05-01T00:00:00.00Z"
},
{
"name": "stopTime",
"type": "string",
"value": "2024-08-01T00:00:00.00Z"
}
]
},
"collections": [
"commons.workflow_run",
"benchmarks.oss_ci_benchmark"
],
"state": "ACTIVE",
"stats": {
"last_executed": "2024-06-25T07:35:29Z",
"last_executed_by": "[email protected]",
"last_execution_error": null,
"last_execution_error_message": null
},
"public_access_id": null
}
}
@@ -0,0 +1,60 @@
{
"query": "--- This query is used to get the LLMs benchmark results from different experiments. It\n--- queries the TPS and memory bandwidth for each model / quantization combos. This powers\n--- the LLMs benchmark dashboard\nSELECT\n DISTINCT o.workflow_id,\n -- As the JSON response is pretty big, only return the field if it's needed\n IF(:getJobId, o.job_id, NULL) AS job_id,\n o.name,\n o.metric,\n IF(\n o.actual IS NOT NULL,\n CAST(o.actual AS FLOAT), 0.0\n ) AS actual,\n IF(\n o.target IS NOT NULL,\n CAST(o.target AS FLOAT), 0.0\n ) AS target,\n FORMAT_ISO8601(\n DATE_TRUNC(: granularity, w._event_time)\n ) AS granularity_bucket,\n o.dtype,\n o.device,\nFROM\n benchmarks.oss_ci_benchmark o\n LEFT JOIN commons.workflow_run w ON o.workflow_id = w.id\nWHERE\n (\n ARRAY_CONTAINS(\n SPLIT(: branches, ','),\n w.head_branch\n )\n OR : branches = ''\n )\n AND (\n ARRAY_CONTAINS(\n SPLIT(: commits, ','),\n w.head_sha\n )\n OR : commits = ''\n )\n AND (\n ARRAY_CONTAINS(\n SPLIT(: filenames, ','),\n o.filename\n )\n OR : filenames = ''\n )\n AND (\n ARRAY_CONTAINS(\n SPLIT(: names, ','),\n o.name\n )\n OR : names = ''\n )\n AND (\n ARRAY_CONTAINS(\n SPLIT(: devices, ','),\n o.device\n )\n OR : devices = ''\n )\n AND (\n ARRAY_CONTAINS(\n SPLIT(: dtypes, ','),\n o.dtype\n )\n OR : dtypes = ''\n )\n AND o.metric IS NOT NULL\n AND o.dtype IS NOT NULL\n AND o.device IS NOT NULL\n AND w.html_url LIKE CONCAT('%', : repo, '%')\nORDER BY\n granularity_bucket DESC,\n workflow_id DESC,\n name,\n dtype,\n device",
"default_parameters": [
{
"name": "branches",
"type": "string",
"value": "main"
},
{
"name": "commits",
"type": "string",
"value": ""
},
{
"name": "devices",
"type": "string",
"value": ""
},
{
"name": "dtypes",
"type": "string",
"value": ""
},
{
"name": "filenames",
"type": "string",
"value": ""
},
{
"name": "getJobId",
"type": "bool",
"value": "false"
},
{
"name": "granularity",
"type": "string",
"value": "day"
},
{
"name": "names",
"type": "string",
"value": ""
},
{
"name": "repo",
"type": "string",
"value": "pytorch/pytorch"
},
{
"name": "startTime",
"type": "string",
"value": "2024-05-01T00:00:00.00Z"
},
{
"name": "stopTime",
"type": "string",
"value": "2024-08-01T00:00:00.00Z"
}
]
}
@@ -0,0 +1,62 @@
{
"workspace": "benchmarks",
"last_updated_by": "[email protected]",
"last_updated": "2024-06-16T06:09:30Z",
"name": "oss_ci_benchmark_names",
"version_count": 5,
"collections": [
"commons.workflow_run",
"benchmarks.oss_ci_benchmark"
],
"latest_version": {
"workspace": "benchmarks",
"created_by": "[email protected]",
"created_by_apikey_name": null,
"created_at": "2024-06-16T06:09:30Z",
"name": "oss_ci_benchmark_names",
"version": "98a212e928df968b",
"description": "Query experiment names from OSS CI benchmarks",
"sql": {
"query": "--- This query is used by HUD benchmarks dashboards to get the list of experiment names\nSELECT DISTINCT\n o.filename, \n o.name, \n o.metric,\n o.dtype,\n o.device,\nFROM\n benchmarks.oss_ci_benchmark o\n LEFT JOIN commons.workflow_run w ON o.workflow_id = w.id\nWHERE\n o._event_time >= PARSE_DATETIME_ISO8601(: startTime)\n AND o._event_time < PARSE_DATETIME_ISO8601(: stopTime)\n AND (\n ARRAY_CONTAINS(\n SPLIT(: filenames, ','),\n o.filename\n )\n OR : filenames = ''\n )\n AND o.metric IS NOT NULL\n AND w.html_url LIKE CONCAT('%', : repo, '%')\n AND o.dtype IS NOT NULL\n AND o.device IS NOT NULL\nORDER BY\n o.filename, \n o.name,\n o.metric,\n o.dtype,\n o.device",
"default_parameters": [
{
"name": "filenames",
"type": "string",
"value": ""
},
{
"name": "granularity",
"type": "string",
"value": "day"
},
{
"name": "repo",
"type": "string",
"value": "pytorch/pytorch"
},
{
"name": "startTime",
"type": "string",
"value": "2024-05-01T00:00:00.00Z"
},
{
"name": "stopTime",
"type": "string",
"value": "2024-08-01T00:00:00.00Z"
}
]
},
"collections": [
"commons.workflow_run",
"benchmarks.oss_ci_benchmark"
],
"state": "ACTIVE",
"stats": {
"last_executed": "2024-06-25T07:35:28Z",
"last_executed_by": "[email protected]",
"last_execution_error": null,
"last_execution_error_message": null
},
"public_access_id": null
}
}
@@ -0,0 +1,30 @@
{
"query": "--- This query is used by HUD benchmarks dashboards to get the list of experiment names\nSELECT DISTINCT\n o.filename, \n o.name, \n o.metric,\n o.dtype,\n o.device,\nFROM\n benchmarks.oss_ci_benchmark o\n LEFT JOIN commons.workflow_run w ON o.workflow_id = w.id\nWHERE\n o._event_time >= PARSE_DATETIME_ISO8601(: startTime)\n AND o._event_time < PARSE_DATETIME_ISO8601(: stopTime)\n AND (\n ARRAY_CONTAINS(\n SPLIT(: filenames, ','),\n o.filename\n )\n OR : filenames = ''\n )\n AND o.metric IS NOT NULL\n AND w.html_url LIKE CONCAT('%', : repo, '%')\n AND o.dtype IS NOT NULL\n AND o.device IS NOT NULL\nORDER BY\n o.filename, \n o.name,\n o.metric,\n o.dtype,\n o.device",
"default_parameters": [
{
"name": "filenames",
"type": "string",
"value": ""
},
{
"name": "granularity",
"type": "string",
"value": "day"
},
{
"name": "repo",
"type": "string",
"value": "pytorch/pytorch"
},
{
"name": "startTime",
"type": "string",
"value": "2024-05-01T00:00:00.00Z"
},
{
"name": "stopTime",
"type": "string",
"value": "2024-08-01T00:00:00.00Z"
}
]
}
@@ -0,0 +1,41 @@
{
"workspace": "commons",
"last_updated_by": "[email protected]",
"last_updated": "2022-01-16T08:24:39Z",
"name": "GHA-CI-for-shas",
"version_count": 6,
"collections": [
"commons.workflow_run",
"GitHub-Actions.workflow_run"
],
"latest_version": {
"workspace": "commons",
"created_by": "[email protected]",
"created_by_apikey_name": null,
"created_at": "2022-01-16T08:24:39Z",
"name": "GHA-CI-for-shas",
"version": "ae1b83292611eff2",
"description": "Get GHA results for a specific set of SHAs",
"sql": {
"query": "SELECT head_sha, head_branch, html_url, name, status, conclusion\nFROM workflow_run\nWHERE ARRAY_CONTAINS(SPLIT(:shas, ','), head_sha)",
"default_parameters": [
{
"name": "shas",
"type": "string",
"value": ""
}
]
},
"collections": [
"commons.workflow_run"
],
"state": "ACTIVE",
"stats": {
"last_executed": null,
"last_executed_by": null,
"last_execution_error": null,
"last_execution_error_message": null
},
"public_access_id": null
}
}