diff --git a/setup.py b/setup.py
index 31c0ca6b0..c4961b960 100644
--- a/setup.py
+++ b/setup.py
@@ -35,5 +35,6 @@
         "analytics-python==1.3.0b1",
         "httpx>=0.23.0",
         "oauthlib",
+        "redis",
     ],
 )
diff --git a/shared/helpers/cache.py b/shared/helpers/cache.py
new file mode 100644
index 000000000..ab2f7b06b
--- /dev/null
+++ b/shared/helpers/cache.py
@@ -0,0 +1,230 @@
+import asyncio
+import base64
+import hashlib
+import logging
+import pickle
+from functools import wraps
+from typing import Any, Callable, Hashable
+
+from redis import Redis, RedisError
+
+from shared.metrics import metrics
+
+log = logging.getLogger(__name__)
+
+NO_VALUE = object()
+
+DEFAULT_TTL = 120
+
+
+def make_hash_sha256(o: Any) -> str:
+    """Provides a machine-independent, consistent hash value for any object
+
+    Args:
+        o (Any): Any object we want
+
+    Returns:
+        str: a sha256-based hash that is always the same for the same object
+    """
+    hasher = hashlib.sha256()
+    hasher.update(repr(make_hashable(o)).encode())
+    return base64.b64encode(hasher.digest()).decode()
+
+
+def make_hashable(o: Any) -> Hashable:
+    """
+    Converts any object into an object that will have a consistent hash
+    """
+    if isinstance(o, (tuple, list)):
+        return tuple((make_hashable(e) for e in o))
+    if isinstance(o, dict):
+        return tuple(sorted((k, make_hashable(v)) for k, v in o.items()))
+    if isinstance(o, (set, frozenset)):
+        return tuple(sorted(make_hashable(e) for e in o))
+    return o
+
+
+class BaseBackend(object):
+    """
+    This is the interface a class needs to honor in order to work as a backend.
+
+    The only two needed functions are `get` and `set`, which will fetch information from the
+    cache and send information to it, respectively.
+
+    However the cache wants to work internally, it's their choice. They only need to be able to
+    `set` and `get` without raising any exceptions
+    """
+
+    def get(self, key: str) -> Any:
+        """Returns a cached value from the cache, or NO_VALUE, if no cache is set for that key
+
+        Args:
+            key (str): The key that represents the object
+
+        Returns:
+            Any: The object that is possibly cached, or NO_VALUE, if no cache was there
+        """
+        raise NotImplementedError()
+
+    def set(self, key: str, ttl: int, value: Any):
+        raise NotImplementedError()
+
+
+class NullBackend(BaseBackend):
+    """
+    This is the default implementation of BaseBackend that is used.
+
+    It essentially `gets` as if nothing is cached, and does not cache anything when requested
+    to.
+
+    This makes the cache virtually transparent. It acts as if no cache was there
+    """
+
+    def get(self, key: str) -> Any:
+        return NO_VALUE
+
+    def set(self, key: str, ttl: int, value: Any):
+        pass
+
+
+class RedisBackend(BaseBackend):
+
+    current_protocol = pickle.DEFAULT_PROTOCOL
+
+    def __init__(self, redis_connection: Redis):
+        self.redis_connection = redis_connection
+
+    def get(self, key: str) -> Any:
+        try:
+            serialized_value = self.redis_connection.get(key)
+        except RedisError:
+            log.warning("Unable to fetch from cache on redis", exc_info=True)
+            return NO_VALUE
+        if serialized_value is None:
+            return NO_VALUE
+        try:
+            return pickle.loads(serialized_value)
+        except ValueError:
+            return NO_VALUE
+
+    def set(self, key: str, ttl: int, value: Any):
+        serialized_value = pickle.dumps(value, self.current_protocol)
+        try:
+            self.redis_connection.setex(key, ttl, serialized_value)
+        except RedisError:
+            log.warning("Unable to set cache on redis", exc_info=True)
+
+
+class OurOwnCache(object):
+    """
+    This is codecov distributed cache's implementation.
+
+    The tldr to use it is, given a function f:
+
+    ```
+    from helpers.cache import cache
+
+    @cache.cache_function()
+    def f(...):
+        ...
+    ```
+
+    Now to explain its internal workings.
+
+    This is a configurable-at-runtime cache. Its whole idea is based on the fact that it does
+    not need information at import-time. This allows us to use it transparently and still
+    not have to change tests, for example, due to it. All tests occur as if the cache was
+    not there.
+
+    All that is needed to configure the backend is to do
+
+    ```
+    cache.configure(any_backend)
+    ```
+
+    which we currently do at `worker_process_init` time with a RedisBackend instance. Other
+    instances can be plugged in easily, once needed. A backend is any implementation
+    of `BaseBackend`, which is described at their docstrings.
+
+    When `cache.cache_function()` is called, a `FunctionCacher` is returned. They do the heavy
+    lifting of actually decorating the function properly, dealing with sync-async context.
+
+    """
+
+    def __init__(self):
+        self._backend = NullBackend()
+        self._app = "not_configured"
+
+    def configure(self, backend: BaseBackend, app: str = "shared"):
+        self._backend = backend
+        self._app = app
+
+    def get_backend(self) -> BaseBackend:
+        return self._backend
+
+    def cache_function(self, ttl: int = DEFAULT_TTL) -> "FunctionCacher":
+        """Creates a FunctionCacher with all the needed configuration to cache a function
+
+        Args:
+            ttl (int, optional): The time-to-live of the cache
+
+        Returns:
+            FunctionCacher: A FunctionCacher that can decorate any callable
+        """
+        return FunctionCacher(self, ttl)
+
+
+class FunctionCacher(object):
+    def __init__(self, cache_instance: OurOwnCache, ttl: int):
+        self.cache_instance = cache_instance
+        self.ttl = ttl
+
+    def __call__(self, func) -> Callable:
+        if asyncio.iscoroutinefunction(func):
+            return self.cache_async_function(func)
+        return self.cache_synchronous_function(func)
+
+    def cache_synchronous_function(self, func: Callable) -> Callable:
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            key = self.generate_key(func, args, kwargs)
+            value = self.cache_instance.get_backend().get(key)
+            if value is not NO_VALUE:
+                metrics.incr(f"{self.cache_instance._app}.caches.{func.__name__}.hits")
+                return value
+            metrics.incr(f"{self.cache_instance._app}.caches.{func.__name__}.misses")
+            with metrics.timer(
+                f"{self.cache_instance._app}.caches.{func.__name__}.runtime"
+            ):
+                result = func(*args, **kwargs)
+            self.cache_instance.get_backend().set(key, self.ttl, result)
+            return result
+
+        return wrapped
+
+    def generate_key(self, func, args, kwargs) -> str:
+        func_name = make_hash_sha256(func.__name__)
+        tupled_args = make_hash_sha256(args)
+        frozen_kwargs = make_hash_sha256(kwargs)
+        return ":".join(["cache", func_name, tupled_args, frozen_kwargs])
+
+    def cache_async_function(self, func: Callable) -> Callable:
+        @wraps(func)
+        async def wrapped(*args, **kwargs):
+            key = self.generate_key(func, args, kwargs)
+            value = self.cache_instance.get_backend().get(key)
+            if value is not NO_VALUE:
+                metrics.incr(f"{self.cache_instance._app}.caches.{func.__name__}.hits")
+                return value
+            metrics.incr(f"{self.cache_instance._app}.caches.{func.__name__}.misses")
+            with metrics.timer(
+                f"{self.cache_instance._app}.caches.{func.__name__}.runtime"
+            ):
+                result = await func(*args, **kwargs)
+            self.cache_instance.get_backend().set(key, self.ttl, result)
+            return result
+
+        return wrapped
+
+
+cache = OurOwnCache()
diff --git a/tests/unit/helpers/test_cache.py b/tests/unit/helpers/test_cache.py
new file mode 100644
index 000000000..34c8277bd
--- /dev/null
+++ b/tests/unit/helpers/test_cache.py
@@ -0,0 +1,151 @@
+import pickle
+
+import pytest
+from redis.exceptions import TimeoutError
+
+from shared.helpers.cache import (
+    NO_VALUE,
+    BaseBackend,
+    OurOwnCache,
+    RedisBackend,
+    make_hash_sha256,
+)
+
+
+class RandomCounter(object):
+    def __init__(self):
+        self.value = 0
+
+    def call_function(self):
+        self.value += 1
+        return self.value
+
+    async def async_call_function(self):
+        self.value += 2
+        self.value *= 4
+        return self.value
+
+
+class FakeBackend(BaseBackend):
+    def __init__(self):
+        self.all_keys = {}
+
+    def get(self, key):
+        possible_values = self.all_keys.get(key, {})
+        for ttl, val in possible_values.items():
+            return val
+        return NO_VALUE
+
+    def set(self, key, ttl, value):
+        if key not in self.all_keys:
+            self.all_keys[key] = {}
+        self.all_keys[key][ttl] = value
+
+
+class FakeRedis(object):
+    def __init__(self):
+        self.all_keys = {}
+
+    def get(self, key):
+        return self.all_keys.get(key)
+
+    def setex(self, key, expire, value):
+        self.all_keys[key] = value
+
+
+class FakeRedisWithIssues(object):
+    def get(self, key):
+        raise TimeoutError()
+
+    def setex(self, key, expire, value):
+        raise TimeoutError()
+
+
+class TestRedisBackend(object):
+    def test_simple_redis_call(self):
+        redis_backend = RedisBackend(FakeRedis())
+        assert redis_backend.get("normal_key") == NO_VALUE
+        redis_backend.set("normal_key", 120, {"value_1": set("ascdefgh"), 1: [1, 3]})
+        assert redis_backend.get("normal_key") == {
+            "value_1": set("ascdefgh"),
+            1: [1, 3],
+        }
+
+    def test_simple_redis_call_invalid_pickle_version(self):
+        redis_instance = FakeRedis()
+        # PICKLE HERE WILL BE SET TO VERSION 9 (\x09 in the second byte of the value)
+        # IF THIS STOPS FAILING WITH ValueError, CHANGE THE SECOND BYTE TO SOMETHING HIGHER
+        redis_instance.setex("key", 120, b"\x80\x09X\x05\x00\x00\x00valueq\x00.")
+        redis_backend = RedisBackend(redis_instance)
+        assert redis_backend.get("key") == NO_VALUE
+
+    def test_simple_redis_call_exception(self):
+        redis_backend = RedisBackend(FakeRedisWithIssues())
+        assert redis_backend.get("normal_key") == NO_VALUE
+        redis_backend.set("normal_key", 120, {"value_1": set("ascdefgh"), 1: [1, 3]})
+        assert redis_backend.get("normal_key") == NO_VALUE
+
+
+class TestCache(object):
+    def test_simple_caching_no_backend_no_params(self, mocker):
+        cache = OurOwnCache()
+        sample_function = RandomCounter().call_function
+        cached_function = cache.cache_function()(sample_function)
+        assert cached_function() == 1
+        assert cached_function() == 2
+        assert cached_function() == 3
+
+    def test_simple_caching_no_backend_no_params_with_ttl(self, mocker):
+        cache = OurOwnCache()
+        sample_function = RandomCounter().call_function
+        cached_function = cache.cache_function(ttl=300)(sample_function)
+        assert cached_function() == 1
+        assert cached_function() == 2
+        assert cached_function() == 3
+
+    @pytest.mark.asyncio
+    async def test_simple_caching_no_backend_async_no_params(self, mocker):
+        cache = OurOwnCache()
+        assert cache._app == "not_configured"
+        sample_function = RandomCounter().async_call_function
+        cached_function = cache.cache_function()(sample_function)
+        assert (await cached_function()) == 8
+        assert (await cached_function()) == 40
+        assert (await cached_function()) == 168
+
+    def test_simple_caching_fake_backend_no_params(self, mocker):
+        cache = OurOwnCache()
+        cache.configure(FakeBackend())
+        assert cache._app == "shared"
+        sample_function = RandomCounter().call_function
+        cached_function = cache.cache_function()(sample_function)
+        assert cached_function() == 1
+        assert cached_function() == 1
+        assert cached_function() == 1
+
+    @pytest.mark.asyncio
+    async def test_simple_caching_fake_backend_async_no_params(self, mocker):
+        cache = OurOwnCache()
+        cache.configure(FakeBackend(), app="worker")
+        assert cache._app == "worker"
+        sample_function = RandomCounter().async_call_function
+        cached_function = cache.cache_function()(sample_function)
+        assert (await cached_function()) == 8
+        assert (await cached_function()) == 8
+        assert (await cached_function()) == 8
+
+    @pytest.mark.asyncio
+    async def test_make_hash_sha256(self):
+        assert make_hash_sha256(1) == "a4ayc/80/OGda4BO/1o/V0etpOqiLx1JwB5S3beHW0s="
+        assert (
+            make_hash_sha256("somestring")
+            == "l5nfZJ7iQAll9QGKjGm4wPuSgUoikOMrdpOw/36GLyw="
+        )
+        this_set = set(["1", "something", "True", "another_string_of_values"])
+        assert (
+            make_hash_sha256(this_set) == "siFp5vd4+aI5SxlURDMV3Z5Yfn5qnpSbCctIewE6m44="
+        )
+        this_set.add("ooops")
+        assert (
+            make_hash_sha256(this_set) == "aoU2Of3YNk0/iW1hqfSkXPbhIAzGMHCSCoxsiLI2b8U="
+        )