Tooling to support Rockset migration (#5366)

Adding tooling to analyze Rockset query lambdas and collections. This is meant to be one-off, throwaway code, just for use during the early days of the migration. It includes: - Code to delete queries that are unused (currently about 80 of 180), in batches of 10 - Backups for all the queries in case we need to revert a delete. Note: The main file of interest in this PR is rockset_queries.py, which is the Python script version generated from the notebook rockset_queries.ipynb.
pytorch · Jun 25, 2024 · cf2eccc · cf2eccc
1 parent caec1fc
commit cf2eccc
Show file tree

Hide file tree

Showing 363 changed files with 13,209 additions and 0 deletions.
diff --git a/tools/rockset_migration/README.md b/tools/rockset_migration/README.md
@@ -0,0 +1,2 @@
+This folder contains tools/scripts used to help with migrating away from Rockset
+
diff --git a/tools/rockset_migration/lambdas_backup/benchmarks.oss_ci_benchmark_branches.raw.json b/tools/rockset_migration/lambdas_backup/benchmarks.oss_ci_benchmark_branches.raw.json
@@ -0,0 +1,62 @@
+{
+  "workspace": "benchmarks",
+  "last_updated_by": "[email protected]",
+  "last_updated": "2024-06-15T23:03:05Z",
+  "name": "oss_ci_benchmark_branches",
+  "version_count": 3,
+  "collections": [
+    "commons.workflow_run",
+    "benchmarks.oss_ci_benchmark"
+  ],
+  "latest_version": {
+    "workspace": "benchmarks",
+    "created_by": "[email protected]",
+    "created_by_apikey_name": null,
+    "created_at": "2024-06-15T23:03:05Z",
+    "name": "oss_ci_benchmark_branches",
+    "version": "76446d877defb748",
+    "description": "Query branches and commits from OSS CI benchmarks",
+    "sql": {
+      "query": "--- This query is used to get the list of branches and commits used by different\n--- OSS CI benchmark experiments. This powers HUD benchmarks dashboards\nSELECT\n  DISTINCT w.head_branch,\n  w.head_sha,\n  w.id,\n  FORMAT_ISO8601(\n    DATE_TRUNC(: granularity, o._event_time)\n  ) AS event_time,\n  o.filename\nFROM\n  benchmarks.oss_ci_benchmark o\n  LEFT JOIN commons.workflow_run w ON o.workflow_id = w.id\nWHERE\n  o._event_time >= PARSE_DATETIME_ISO8601(: startTime)\n  AND o._event_time < PARSE_DATETIME_ISO8601(: stopTime)\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: filenames, ','),\n      o.filename\n    )\n    OR : filenames = ''\n  )\n  AND o.metric IS NOT NULL\n  AND w.html_url LIKE CONCAT('%', : repo, '%')\n  AND o.dtype IS NOT NULL\n  AND o.device IS NOT NULL\nORDER BY\n  w.head_branch,\n  event_time DESC",
+      "default_parameters": [
+        {
+          "name": "filenames",
+          "type": "string",
+          "value": ""
+        },
+        {
+          "name": "granularity",
+          "type": "string",
+          "value": "day"
+        },
+        {
+          "name": "repo",
+          "type": "string",
+          "value": "pytorch/pytorch"
+        },
+        {
+          "name": "startTime",
+          "type": "string",
+          "value": "2024-05-01T00:00:00.00Z"
+        },
+        {
+          "name": "stopTime",
+          "type": "string",
+          "value": "2024-08-01T00:00:00.00Z"
+        }
+      ]
+    },
+    "collections": [
+      "commons.workflow_run",
+      "benchmarks.oss_ci_benchmark"
+    ],
+    "state": "ACTIVE",
+    "stats": {
+      "last_executed": "2024-06-25T07:35:30Z",
+      "last_executed_by": "[email protected]",
+      "last_execution_error": null,
+      "last_execution_error_message": null
+    },
+    "public_access_id": null
+  }
+}
diff --git a/tools/rockset_migration/lambdas_backup/benchmarks.oss_ci_benchmark_branches.sql.json b/tools/rockset_migration/lambdas_backup/benchmarks.oss_ci_benchmark_branches.sql.json
@@ -0,0 +1,30 @@
+{
+  "query": "--- This query is used to get the list of branches and commits used by different\n--- OSS CI benchmark experiments. This powers HUD benchmarks dashboards\nSELECT\n  DISTINCT w.head_branch,\n  w.head_sha,\n  w.id,\n  FORMAT_ISO8601(\n    DATE_TRUNC(: granularity, o._event_time)\n  ) AS event_time,\n  o.filename\nFROM\n  benchmarks.oss_ci_benchmark o\n  LEFT JOIN commons.workflow_run w ON o.workflow_id = w.id\nWHERE\n  o._event_time >= PARSE_DATETIME_ISO8601(: startTime)\n  AND o._event_time < PARSE_DATETIME_ISO8601(: stopTime)\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: filenames, ','),\n      o.filename\n    )\n    OR : filenames = ''\n  )\n  AND o.metric IS NOT NULL\n  AND w.html_url LIKE CONCAT('%', : repo, '%')\n  AND o.dtype IS NOT NULL\n  AND o.device IS NOT NULL\nORDER BY\n  w.head_branch,\n  event_time DESC",
+  "default_parameters": [
+    {
+      "name": "filenames",
+      "type": "string",
+      "value": ""
+    },
+    {
+      "name": "granularity",
+      "type": "string",
+      "value": "day"
+    },
+    {
+      "name": "repo",
+      "type": "string",
+      "value": "pytorch/pytorch"
+    },
+    {
+      "name": "startTime",
+      "type": "string",
+      "value": "2024-05-01T00:00:00.00Z"
+    },
+    {
+      "name": "stopTime",
+      "type": "string",
+      "value": "2024-08-01T00:00:00.00Z"
+    }
+  ]
+}
diff --git a/tools/rockset_migration/lambdas_backup/benchmarks.oss_ci_benchmark_llms.raw.json b/tools/rockset_migration/lambdas_backup/benchmarks.oss_ci_benchmark_llms.raw.json
@@ -0,0 +1,92 @@
+{
+  "workspace": "benchmarks",
+  "last_updated_by": "[email protected]",
+  "last_updated": "2024-06-19T19:40:01Z",
+  "name": "oss_ci_benchmark_llms",
+  "version_count": 6,
+  "collections": [
+    "commons.workflow_run",
+    "benchmarks.oss_ci_benchmark"
+  ],
+  "latest_version": {
+    "workspace": "benchmarks",
+    "created_by": "[email protected]",
+    "created_by_apikey_name": null,
+    "created_at": "2024-06-19T19:40:01Z",
+    "name": "oss_ci_benchmark_llms",
+    "version": "656fe095f7e9a3ab",
+    "description": "The query to power LLMs benchmark dashboard",
+    "sql": {
+      "query": "--- This query is used to get the LLMs benchmark results from different experiments. It\n--- queries the TPS and memory bandwidth for each model / quantization combos. This powers\n--- the LLMs benchmark dashboard\nSELECT\n  DISTINCT o.workflow_id,\n  -- As the JSON response is pretty big, only return the field if it's needed\n  IF(:getJobId, o.job_id, NULL) AS job_id,\n  o.name,\n  o.metric,\n  IF(\n    o.actual IS NOT NULL,\n    CAST(o.actual AS FLOAT), 0.0\n  ) AS actual,\n  IF(\n    o.target IS NOT NULL,\n    CAST(o.target AS FLOAT), 0.0\n  ) AS target,\n  FORMAT_ISO8601(\n    DATE_TRUNC(: granularity, w._event_time)\n  ) AS granularity_bucket,\n  o.dtype,\n  o.device,\nFROM\n  benchmarks.oss_ci_benchmark o\n  LEFT JOIN commons.workflow_run w ON o.workflow_id = w.id\nWHERE\n  (\n    ARRAY_CONTAINS(\n      SPLIT(: branches, ','),\n      w.head_branch\n    )\n    OR : branches = ''\n  )\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: commits, ','),\n      w.head_sha\n    )\n    OR : commits = ''\n  )\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: filenames, ','),\n      o.filename\n    )\n    OR : filenames = ''\n  )\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: names, ','),\n      o.name\n    )\n    OR : names = ''\n  )\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: devices, ','),\n      o.device\n    )\n    OR : devices = ''\n  )\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: dtypes, ','),\n      o.dtype\n    )\n    OR : dtypes = ''\n  )\n  AND o.metric IS NOT NULL\n  AND o.dtype IS NOT NULL\n  AND o.device IS NOT NULL\n  AND w.html_url LIKE CONCAT('%', : repo, '%')\nORDER BY\n  granularity_bucket DESC,\n  workflow_id DESC,\n  name,\n  dtype,\n  device",
+      "default_parameters": [
+        {
+          "name": "branches",
+          "type": "string",
+          "value": "main"
+        },
+        {
+          "name": "commits",
+          "type": "string",
+          "value": ""
+        },
+        {
+          "name": "devices",
+          "type": "string",
+          "value": ""
+        },
+        {
+          "name": "dtypes",
+          "type": "string",
+          "value": ""
+        },
+        {
+          "name": "filenames",
+          "type": "string",
+          "value": ""
+        },
+        {
+          "name": "getJobId",
+          "type": "bool",
+          "value": "false"
+        },
+        {
+          "name": "granularity",
+          "type": "string",
+          "value": "day"
+        },
+        {
+          "name": "names",
+          "type": "string",
+          "value": ""
+        },
+        {
+          "name": "repo",
+          "type": "string",
+          "value": "pytorch/pytorch"
+        },
+        {
+          "name": "startTime",
+          "type": "string",
+          "value": "2024-05-01T00:00:00.00Z"
+        },
+        {
+          "name": "stopTime",
+          "type": "string",
+          "value": "2024-08-01T00:00:00.00Z"
+        }
+      ]
+    },
+    "collections": [
+      "commons.workflow_run",
+      "benchmarks.oss_ci_benchmark"
+    ],
+    "state": "ACTIVE",
+    "stats": {
+      "last_executed": "2024-06-25T07:35:29Z",
+      "last_executed_by": "[email protected]",
+      "last_execution_error": null,
+      "last_execution_error_message": null
+    },
+    "public_access_id": null
+  }
+}
diff --git a/tools/rockset_migration/lambdas_backup/benchmarks.oss_ci_benchmark_llms.sql.json b/tools/rockset_migration/lambdas_backup/benchmarks.oss_ci_benchmark_llms.sql.json
@@ -0,0 +1,60 @@
+{
+  "query": "--- This query is used to get the LLMs benchmark results from different experiments. It\n--- queries the TPS and memory bandwidth for each model / quantization combos. This powers\n--- the LLMs benchmark dashboard\nSELECT\n  DISTINCT o.workflow_id,\n  -- As the JSON response is pretty big, only return the field if it's needed\n  IF(:getJobId, o.job_id, NULL) AS job_id,\n  o.name,\n  o.metric,\n  IF(\n    o.actual IS NOT NULL,\n    CAST(o.actual AS FLOAT), 0.0\n  ) AS actual,\n  IF(\n    o.target IS NOT NULL,\n    CAST(o.target AS FLOAT), 0.0\n  ) AS target,\n  FORMAT_ISO8601(\n    DATE_TRUNC(: granularity, w._event_time)\n  ) AS granularity_bucket,\n  o.dtype,\n  o.device,\nFROM\n  benchmarks.oss_ci_benchmark o\n  LEFT JOIN commons.workflow_run w ON o.workflow_id = w.id\nWHERE\n  (\n    ARRAY_CONTAINS(\n      SPLIT(: branches, ','),\n      w.head_branch\n    )\n    OR : branches = ''\n  )\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: commits, ','),\n      w.head_sha\n    )\n    OR : commits = ''\n  )\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: filenames, ','),\n      o.filename\n    )\n    OR : filenames = ''\n  )\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: names, ','),\n      o.name\n    )\n    OR : names = ''\n  )\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: devices, ','),\n      o.device\n    )\n    OR : devices = ''\n  )\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: dtypes, ','),\n      o.dtype\n    )\n    OR : dtypes = ''\n  )\n  AND o.metric IS NOT NULL\n  AND o.dtype IS NOT NULL\n  AND o.device IS NOT NULL\n  AND w.html_url LIKE CONCAT('%', : repo, '%')\nORDER BY\n  granularity_bucket DESC,\n  workflow_id DESC,\n  name,\n  dtype,\n  device",
+  "default_parameters": [
+    {
+      "name": "branches",
+      "type": "string",
+      "value": "main"
+    },
+    {
+      "name": "commits",
+      "type": "string",
+      "value": ""
+    },
+    {
+      "name": "devices",
+      "type": "string",
+      "value": ""
+    },
+    {
+      "name": "dtypes",
+      "type": "string",
+      "value": ""
+    },
+    {
+      "name": "filenames",
+      "type": "string",
+      "value": ""
+    },
+    {
+      "name": "getJobId",
+      "type": "bool",
+      "value": "false"
+    },
+    {
+      "name": "granularity",
+      "type": "string",
+      "value": "day"
+    },
+    {
+      "name": "names",
+      "type": "string",
+      "value": ""
+    },
+    {
+      "name": "repo",
+      "type": "string",
+      "value": "pytorch/pytorch"
+    },
+    {
+      "name": "startTime",
+      "type": "string",
+      "value": "2024-05-01T00:00:00.00Z"
+    },
+    {
+      "name": "stopTime",
+      "type": "string",
+      "value": "2024-08-01T00:00:00.00Z"
+    }
+  ]
+}
diff --git a/tools/rockset_migration/lambdas_backup/benchmarks.oss_ci_benchmark_names.raw.json b/tools/rockset_migration/lambdas_backup/benchmarks.oss_ci_benchmark_names.raw.json
@@ -0,0 +1,62 @@
+{
+  "workspace": "benchmarks",
+  "last_updated_by": "[email protected]",
+  "last_updated": "2024-06-16T06:09:30Z",
+  "name": "oss_ci_benchmark_names",
+  "version_count": 5,
+  "collections": [
+    "commons.workflow_run",
+    "benchmarks.oss_ci_benchmark"
+  ],
+  "latest_version": {
+    "workspace": "benchmarks",
+    "created_by": "[email protected]",
+    "created_by_apikey_name": null,
+    "created_at": "2024-06-16T06:09:30Z",
+    "name": "oss_ci_benchmark_names",
+    "version": "98a212e928df968b",
+    "description": "Query experiment names from OSS CI benchmarks",
+    "sql": {
+      "query": "--- This query is used by HUD benchmarks dashboards to get the list of experiment names\nSELECT DISTINCT\n  o.filename,  \n  o.name,  \n  o.metric,\n  o.dtype,\n  o.device,\nFROM\n  benchmarks.oss_ci_benchmark o\n  LEFT JOIN commons.workflow_run w ON o.workflow_id = w.id\nWHERE\n  o._event_time >= PARSE_DATETIME_ISO8601(: startTime)\n  AND o._event_time < PARSE_DATETIME_ISO8601(: stopTime)\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: filenames, ','),\n      o.filename\n    )\n    OR : filenames = ''\n  )\n  AND o.metric IS NOT NULL\n  AND w.html_url LIKE CONCAT('%', : repo, '%')\n  AND o.dtype IS NOT NULL\n  AND o.device IS NOT NULL\nORDER BY\n  o.filename,  \n  o.name,\n  o.metric,\n  o.dtype,\n  o.device",
+      "default_parameters": [
+        {
+          "name": "filenames",
+          "type": "string",
+          "value": ""
+        },
+        {
+          "name": "granularity",
+          "type": "string",
+          "value": "day"
+        },
+        {
+          "name": "repo",
+          "type": "string",
+          "value": "pytorch/pytorch"
+        },
+        {
+          "name": "startTime",
+          "type": "string",
+          "value": "2024-05-01T00:00:00.00Z"
+        },
+        {
+          "name": "stopTime",
+          "type": "string",
+          "value": "2024-08-01T00:00:00.00Z"
+        }
+      ]
+    },
+    "collections": [
+      "commons.workflow_run",
+      "benchmarks.oss_ci_benchmark"
+    ],
+    "state": "ACTIVE",
+    "stats": {
+      "last_executed": "2024-06-25T07:35:28Z",
+      "last_executed_by": "[email protected]",
+      "last_execution_error": null,
+      "last_execution_error_message": null
+    },
+    "public_access_id": null
+  }
+}
diff --git a/tools/rockset_migration/lambdas_backup/benchmarks.oss_ci_benchmark_names.sql.json b/tools/rockset_migration/lambdas_backup/benchmarks.oss_ci_benchmark_names.sql.json
@@ -0,0 +1,30 @@
+{
+  "query": "--- This query is used by HUD benchmarks dashboards to get the list of experiment names\nSELECT DISTINCT\n  o.filename,  \n  o.name,  \n  o.metric,\n  o.dtype,\n  o.device,\nFROM\n  benchmarks.oss_ci_benchmark o\n  LEFT JOIN commons.workflow_run w ON o.workflow_id = w.id\nWHERE\n  o._event_time >= PARSE_DATETIME_ISO8601(: startTime)\n  AND o._event_time < PARSE_DATETIME_ISO8601(: stopTime)\n  AND (\n    ARRAY_CONTAINS(\n      SPLIT(: filenames, ','),\n      o.filename\n    )\n    OR : filenames = ''\n  )\n  AND o.metric IS NOT NULL\n  AND w.html_url LIKE CONCAT('%', : repo, '%')\n  AND o.dtype IS NOT NULL\n  AND o.device IS NOT NULL\nORDER BY\n  o.filename,  \n  o.name,\n  o.metric,\n  o.dtype,\n  o.device",
+  "default_parameters": [
+    {
+      "name": "filenames",
+      "type": "string",
+      "value": ""
+    },
+    {
+      "name": "granularity",
+      "type": "string",
+      "value": "day"
+    },
+    {
+      "name": "repo",
+      "type": "string",
+      "value": "pytorch/pytorch"
+    },
+    {
+      "name": "startTime",
+      "type": "string",
+      "value": "2024-05-01T00:00:00.00Z"
+    },
+    {
+      "name": "stopTime",
+      "type": "string",
+      "value": "2024-08-01T00:00:00.00Z"
+    }
+  ]
+}
diff --git a/tools/rockset_migration/lambdas_backup/commons.GHA-CI-for-shas.raw.json b/tools/rockset_migration/lambdas_backup/commons.GHA-CI-for-shas.raw.json
@@ -0,0 +1,41 @@
+{
+  "workspace": "commons",
+  "last_updated_by": "[email protected]",
+  "last_updated": "2022-01-16T08:24:39Z",
+  "name": "GHA-CI-for-shas",
+  "version_count": 6,
+  "collections": [
+    "commons.workflow_run",
+    "GitHub-Actions.workflow_run"
+  ],
+  "latest_version": {
+    "workspace": "commons",
+    "created_by": "[email protected]",
+    "created_by_apikey_name": null,
+    "created_at": "2022-01-16T08:24:39Z",
+    "name": "GHA-CI-for-shas",
+    "version": "ae1b83292611eff2",
+    "description": "Get GHA results for a specific set of SHAs",
+    "sql": {
+      "query": "SELECT head_sha, head_branch, html_url, name, status, conclusion\nFROM workflow_run\nWHERE ARRAY_CONTAINS(SPLIT(:shas, ','), head_sha)",
+      "default_parameters": [
+        {
+          "name": "shas",
+          "type": "string",
+          "value": ""
+        }
+      ]
+    },
+    "collections": [
+      "commons.workflow_run"
+    ],
+    "state": "ACTIVE",
+    "stats": {
+      "last_executed": null,
+      "last_executed_by": null,
+      "last_execution_error": null,
+      "last_execution_error_message": null
+    },
+    "public_access_id": null
+  }
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		This folder contains tools/scripts used to help with migrating away from Rockset