feat: support zstd compression in miniostorage (#405)
* feat: support zstd compression in miniostorage

  We want to use zstd when compressing files for object storage because it performs better than the gzip compression we were using before. These changes are only being made to the minio storage service because we want to consolidate the storage service functionality there, so that both worker and API will use this backend in the future (API was already using it).

  We have to manually decompress zstd-compressed files in read_file, but HTTPResponse takes care of it for us if the content encoding of the file is gzip (see the decompression sketch after this list).

  The is_already_gzipped argument is being deprecated in favour of compression_type and is_compressed, and the ability to pass a str to write_file is also being deprecated. We're keeping track of uses of these with Sentry capture_message.

* fix: address feedback

  - Using fget_object was unnecessary, since we were streaming the response data regardless.
  - No need for all the warning logs and Sentry tracking; we'll just do a 3-step migration in both API and worker (update shared while supporting the old behaviour, update {api,worker}, then remove old-behaviour support from shared).
  - The zstandard version pinning can be more flexible.
  - Add a test for content type = application/x-gzip, since the GCP storage service had some specific handling for it.

* fix: update MinioStorageService

  In write_file:

  - The data arg is not BinaryIO; it's actually bytes | str | IO[bytes]. bytes and str are self-explanatory (that's just how it's being used currently, so we must support it), and IO[bytes] covers file handles opened with "rb" and BytesIO objects.
  - Accept a None value for compression_type, which means no automatic compression even if is_compressed is False.
  - Compress automatically using gzip if is_compressed=False and compression_type="gzip".
  - In put_object, set size = -1 and use a part_size of 20 MiB (see the upload sketch after this list). The specific part size is arbitrary; different sources online suggest different numbers. It probably depends on the size of the underlying data being sent, but 20 MiB seems like a good flat number to pick for now.

  In read_file:

  - Generally reorganize the function to spend less time inside the try/except blocks.
  - Use the CHUNK_SIZE const defined in storage/base for the amount to read from the streams.
  - Accept IO[bytes] for the file_obj, since we don't use any of the BinaryIO-specific methods.

  Also create GZipStreamReader, which takes an IO[bytes] and implements a read() method that reads a given number of bytes from the underlying stream, compresses whatever it read using gzip, and returns the result (see the reader sketch after this list).

* fix(minio): check urllib3 version in read_file

  If urllib3 is >= 2.0.0 and was installed with the zstd extra, it is capable of decoding (and will decode) zstd-encoded data when used in get_object. So when we create the MinioStorageService, we check the urllib3 version and whether it was installed with the zstd extra (see the version-check sketch after this list).

  This commit also adds a test to ensure that the gzip compression and decompression used in the GZipStreamReader actually work.

* feat: add feature flag for new minio storage

  Instead of doing a 0-to-100 launch of the new minio storage service, I'd like to ship it incrementally behind a feature flag. If a repoid is passed to the get_appropriate_storage_service function and the chosen storage is minio, it will check the use_new_minio feature to decide whether to use the new or the old minio storage service (see the routing sketch after this list). As mentioned, this is decided via the repoid (to reduce the impact IF it is broken). Changes had to be made to avoid circular imports in the model_utils and rollout_utils files.

* fix: revert changes to old minio
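For the manual zstd path in read_file, a decompression loop along these lines would do the job. This is a minimal sketch using the zstandard package's streaming API; the CHUNK_SIZE value here is a stand-in for the const defined in storage/base.

```python
from typing import IO

import zstandard

CHUNK_SIZE = 1024 * 1024  # stand-in for the CHUNK_SIZE const in storage/base


def decompress_zstd(raw: IO[bytes], out: IO[bytes]) -> None:
    """Stream-decompress a zstd-encoded object body into `out`."""
    dctx = zstandard.ZstdDecompressor()
    with dctx.stream_reader(raw) as reader:
        # Read fixed-size chunks so large objects never sit fully in memory.
        while chunk := reader.read(CHUNK_SIZE):
            out.write(chunk)
```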
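The reader sketch: GZipStreamReader could look roughly like this; the exact buffering behaviour in shared may differ. Each read() call compresses one chunk into a complete gzip member, and concatenated members form a valid multi-member gzip stream, which standard decompressors handle.

```python
import gzip
import io
from typing import IO


class GZipStreamReader:
    """Expose a gzip-compressed view of an underlying byte stream."""

    def __init__(self, fileobj: IO[bytes]):
        self.fileobj = fileobj

    def read(self, size: int = -1) -> bytes:
        # Pull up to `size` raw bytes and return them gzip-compressed.
        # The compressed chunk's length will differ from `size`.
        chunk = self.fileobj.read(size)
        if not chunk:
            return b""
        return gzip.compress(chunk)


# Round-trip check in the spirit of the test mentioned above:
payload = b"hello " * 1000
reader = GZipStreamReader(io.BytesIO(payload))
compressed = b"".join(iter(lambda: reader.read(4096), b""))
assert gzip.decompress(compressed) == payload  # handles multi-member streams
```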
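The upload sketch: streaming that reader into MinIO hinges on the SDK rule that put_object accepts length=-1 only when part_size is given; the 20 MiB figure is the flat number picked above. The bucket, object name, and credentials here are illustrative.

```python
import io

from minio import Minio

PART_SIZE = 20 * 1024 * 1024  # 20 MiB

client = Minio("minio:9000", access_key="...", secret_key="...", secure=False)

data = GZipStreamReader(io.BytesIO(b"raw report contents"))  # reader from the sketch above
client.put_object(
    "archive",            # bucket name (illustrative)
    "path/to/object",     # object name (illustrative)
    data,
    length=-1,            # total size unknown up front: stream the upload
    part_size=PART_SIZE,  # required by the SDK whenever length == -1
)
```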
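The version-check sketch: the capability test at service-creation time could be as simple as the following. Treating "zstd extra installed" as "the zstandard package is importable" is an assumption, though it matches how the urllib3[zstd] extra is packaged.

```python
import urllib3


def urllib3_decodes_zstd() -> bool:
    """True if urllib3 will transparently decode zstd response bodies."""
    # zstd decoding arrived in urllib3 2.0.0 ...
    if int(urllib3.__version__.split(".")[0]) < 2:
        return False
    # ... and only kicks in when the zstd extra's dependency is present.
    try:
        import zstandard  # noqa: F401
    except ImportError:
        return False
    return True
```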
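The routing sketch: the rollout then reduces to a repoid-keyed check inside get_appropriate_storage_service. Everything below except that function name is a stand-in: the flag lookup and the two service classes model the real pieces in shared, whose interfaces may differ.

```python
class MinioStorageService:  # stand-in for the existing implementation
    pass


class NewMinioStorageService:  # stand-in for the new implementation
    pass


def use_new_minio(repoid: int) -> bool:
    # Stand-in for evaluating the `use_new_minio` feature flag by repoid.
    return False


def get_appropriate_storage_service(repoid: int | None = None):
    chosen_storage = "minio"  # would come from site config
    if chosen_storage == "minio" and repoid is not None and use_new_minio(repoid):
        return NewMinioStorageService()
    return MinioStorageService()  # the old service remains the default
```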