Merge remote-tracking branch 'origin' into add-timeout-to-batch-writer
ayirr7 committed Jun 25, 2024
2 parents 296fbe1 + 4a28b37 commit dad7f7a
Showing 14 changed files with 511 additions and 320 deletions.
8 changes: 7 additions & 1 deletion .vscode/launch.json
@@ -1,6 +1,13 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
},
{
"name": "Pytest Current File",
"type": "debugpy",
@@ -10,7 +17,6 @@
"${file}"
],
"console": "integratedTerminal",
"justMyCode": true // wasn't sure if I should keep or delete
}
]
}
7 changes: 7 additions & 0 deletions snuba/cli/migrations.py
@@ -1,4 +1,5 @@
import os
import re
from typing import Optional, Sequence

import click
@@ -392,5 +393,11 @@ def generate(
The migration will be written into the local directory. The user is responsible for making
the commit, PR, and merging.
"""
expected_pattern = r"(.+/)?snuba/datasets/configuration/.*/storages/.*\.(yml|yaml)"
if not re.fullmatch(expected_pattern, storage_path):
raise click.ClickException(
f"Storage path {storage_path} does not match expected pattern {expected_pattern}"
)

autogeneration.generate(storage_path)
click.echo("This function is under construction.")
14 changes: 7 additions & 7 deletions snuba/datasets/configuration/replays/storages/replays.yaml
@@ -190,19 +190,19 @@ allocation_policies:
- project_id
default_config_overrides:
is_enforced: 0
- name: BytesScannedWindowAllocationPolicy
- name: ReferrerGuardRailPolicy
args:
required_tenant_types:
- organization_id
- referrer
default_config_overrides:
is_enforced: 1
throttled_thread_number: 1
org_limit_bytes_scanned: 100000
- name: ReferrerGuardRailPolicy
is_enforced: 0
is_active: 0
- name: BytesScannedRejectingPolicy
args:
required_tenant_types:
- organization_id
- project_id
- referrer
default_config_overrides:
is_enforced: 0
is_active: 0
is_enforced: 0
15 changes: 8 additions & 7 deletions snuba/datasets/configuration/spans/storages/spans.yaml
@@ -115,22 +115,23 @@ allocation_policies:
- project_id
default_config_overrides:
is_enforced: 0
- name: BytesScannedWindowAllocationPolicy
- name: ReferrerGuardRailPolicy
args:
required_tenant_types:
- organization_id
- referrer
default_config_overrides:
is_enforced: 1
throttled_thread_number: 1
org_limit_bytes_scanned: 100000
- name: ReferrerGuardRailPolicy
is_enforced: 0
is_active: 0
- name: BytesScannedRejectingPolicy
args:
required_tenant_types:
- organization_id
- project_id
- referrer
default_config_overrides:
is_enforced: 0
is_active: 0
is_enforced: 0

query_processors:
- processor: UniqInSelectAndHavingProcessor
- processor: UUIDColumnProcessor
139 changes: 139 additions & 0 deletions snuba/migrations/autogeneration/diff.py
@@ -0,0 +1,139 @@
from typing import cast

import yaml

from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.configuration.utils import parse_columns
from snuba.migrations.columns import MigrationModifiers
from snuba.migrations.operations import AddColumn, DropColumn, OperationTarget
from snuba.utils.schemas import Column, ColumnType, SchemaModifiers

"""
This file is for autogenerating the migration for adding a column to your storage.
"""


def generate_migration_ops(
oldstorage: str, newstorage: str
) -> tuple[list[AddColumn], list[DropColumn]]:
"""
Input:
old_storage, the original storage yaml in str format
new_storage, the modified storage yaml in str format
Returns a tuple (forwardops, backwardsops); these are the forward and backward migration
operations required to migrate the storage as described in the given yaml files.
Only supports adding columns, throws error for anything else.
"""
valid, reason = _is_valid_add_column(oldstorage, newstorage)
if not valid:
raise ValueError(reason)

oldcol_names = set(
col["name"] for col in yaml.safe_load(oldstorage)["schema"]["columns"]
)
newstorage_dict = yaml.safe_load(newstorage)
newcols = newstorage_dict["schema"]["columns"]

forwardops: list[AddColumn] = []
for i, col in enumerate(newcols):
if col["name"] not in oldcol_names:
column = _schema_column_to_migration_column(parse_columns([col])[0])
after = newcols[i - 1]["name"]
storage_set = StorageSetKey(newstorage_dict["storage"]["set_key"])
forwardops += [
AddColumn(
storage_set=storage_set,
table_name=newstorage_dict["schema"]["local_table_name"],
column=column,
after=after,
target=OperationTarget.LOCAL,
),
AddColumn(
storage_set=storage_set,
table_name=newstorage_dict["schema"]["dist_table_name"],
column=column,
after=after,
target=OperationTarget.DISTRIBUTED,
),
]
return (forwardops, [op.get_reverse() for op in reversed(forwardops)])


def _is_valid_add_column(oldstorage: str, newstorage: str) -> tuple[bool, str]:
"""
Input:
old_storage, the old storage yaml in str format
new_storage, the modified storage yaml in str format
Returns True if the changes to the storage are a valid column addition, False otherwise,
along with a reason.
"""
oldstorage_dict = yaml.safe_load(oldstorage)
newstorage_dict = yaml.safe_load(newstorage)
if oldstorage_dict == newstorage_dict:
return True, "storages are the same"

# nothing changed but the columns
t1 = oldstorage_dict["schema"].pop("columns")
t2 = newstorage_dict["schema"].pop("columns")
if not (oldstorage_dict == newstorage_dict):
return (
False,
"Expected the only change to the storage to be the columns, but that is not true",
)
oldstorage_dict["schema"]["columns"] = t1
newstorage_dict["schema"]["columns"] = t2

# only changes to columns is additions
oldstorage_cols = oldstorage_dict["schema"]["columns"]
newstorage_cols = newstorage_dict["schema"]["columns"]

colnames_old = set(e["name"] for e in oldstorage_cols)
colnames_new = set(e["name"] for e in newstorage_cols)
if not colnames_old.issubset(colnames_new):
return (False, "Column removal is not supported")

pold, pnew = 0, 0
while pold < len(oldstorage_cols) and pnew < len(newstorage_cols):
curr_old = oldstorage_cols[pold]
curr_new = newstorage_cols[pnew]

if curr_old == curr_new:
pold += 1
pnew += 1
elif curr_new["name"] in colnames_old:
return (
False,
f"Modification to columns in unsupported, column '{curr_new['name']}' was modified or reordered",
)
else:
if pold == 0:
return (
False,
"Adding a column to the beginning is currently unsupported, please add it anywhere else.",
)
else:
pnew += 1
assert pold == len(oldstorage_cols) # should always hold
return True, ""


def _schema_column_to_migration_column(
column: Column[SchemaModifiers],
) -> Column[MigrationModifiers]:
"""
Given SchemaModifiers returns equivalent MigrationModifiers.
Only nullable is supported; throws an error if the conversion can't be made.
"""
newtype = cast(ColumnType[MigrationModifiers], column.type.get_raw())
mods = column.type.get_modifiers()
if not mods:
return Column(column.name, newtype)

# convert schema modifiers to migration modifiers
if mods.readonly:
raise ValueError("readonly modifier is not supported")
newtype = newtype.set_modifiers(MigrationModifiers(nullable=mods.nullable))
return Column(column.name, newtype)
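To show how the new API is meant to be used, here is a minimal sketch of calling generate_migration_ops; the storage YAML is a trimmed, hypothetical example carrying only the keys this module reads (storage.set_key, the schema columns and the table names), whereas real storage configs contain many more fields:

from snuba.migrations.autogeneration.diff import generate_migration_ops

old_storage = """
storage:
  key: example
  set_key: events
schema:
  columns:
    - { name: project_id, type: UInt, args: { size: 64 } }
  local_table_name: example_local
  dist_table_name: example_dist
"""

# Identical to old_storage except for one column appended after project_id.
new_storage = """
storage:
  key: example
  set_key: events
schema:
  columns:
    - { name: project_id, type: UInt, args: { size: 64 } }
    - { name: duration_ms, type: UInt, args: { size: 32 } }
  local_table_name: example_local
  dist_table_name: example_dist
"""

forward_ops, backward_ops = generate_migration_ops(old_storage, new_storage)
# forward_ops: AddColumn on the local and distributed tables, placed after project_id
# backward_ops: the matching DropColumn operations, in reverse order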
36 changes: 26 additions & 10 deletions snuba/migrations/autogeneration/main.py
@@ -1,11 +1,27 @@
import os
import subprocess

from snuba.migrations.autogeneration.diff import generate_migration_ops

def generate(storage_path: str) -> tuple[str, str]:
storage_path = os.path.realpath(os.path.abspath(os.path.expanduser(storage_path)))

# get the version of the file at HEAD
def generate(storage_path: str) -> None:
# load into memory the given storage and the version of it at HEAD
new_storage, old_storage = get_working_and_head(storage_path)

# generate the migration operations
generate_migration_ops(old_storage, new_storage)


def get_working_and_head(path: str) -> tuple[str, str]:
"""
Given a path to a file, returns the contents of the file in the working directory
and the contents of it at HEAD in the git repo, as a tuple: (working, head)
preconditions:
- path is a valid path to a file in a git repo
"""
path = os.path.realpath(os.path.abspath(os.path.expanduser(path)))
# get the version at HEAD
try:
repo_path = (
subprocess.run(
@@ -14,15 +30,15 @@ def generate(storage_path: str) -> tuple[str, str]:
"rev-parse",
"--show-toplevel",
],
cwd=os.path.dirname(storage_path),
cwd=os.path.dirname(path),
capture_output=True,
check=True,
)
.stdout.decode("utf-8")
.strip()
)
repo_rel_path = os.path.relpath(storage_path, repo_path)
old_storage = subprocess.run(
repo_rel_path = os.path.relpath(path, repo_path)
head_file = subprocess.run(
["git", "show", f"HEAD:{repo_rel_path}"],
cwd=repo_path,
capture_output=True,
@@ -31,8 +47,8 @@
except subprocess.CalledProcessError as e:
raise ValueError(e.stderr.decode("utf-8")) from e

# get the user-provided (modified) storage
with open(storage_path, "r") as f:
new_storage = f.read()
# working
with open(path, "r") as f:
working_file = f.read()

return old_storage, new_storage
return (working_file, head_file)
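A quick sketch of the helper's contract; the path is only an example, and the call has to run from inside a git checkout of the repo:

from snuba.migrations.autogeneration.main import get_working_and_head

# Returns (working, head): the file as it currently exists on disk,
# and the same file as committed at HEAD.
working, head = get_working_and_head(
    "snuba/datasets/configuration/spans/storages/spans.yaml"
)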
2 changes: 1 addition & 1 deletion snuba/migrations/clickhouse.py
@@ -1,4 +1,4 @@
CLICKHOUSE_SERVER_MIN_VERSION = "21.8.12.29"
# Note: 21.8.12.29 and 21.8.13.1 are used in self-hosted builds
# even though SaaS clusters are all on 22.8 or above
CLICKHOUSE_SERVER_MAX_VERSION = "23.3.19.33"
CLICKHOUSE_SERVER_MAX_VERSION = "23.8.11.29"
(Diffs for the remaining 7 changed files are not shown.)
