Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(flags): add quota limiting for feature flags served up with /decide and to local_evaluation #28564

Open
wants to merge 28 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
e5b00e8
ruff
dmarticus Feb 11, 2025
03f877c
import constants
dmarticus Feb 11, 2025
20d7c14
Merge branch 'master' into feat/quota-limiting
dmarticus Feb 11, 2025
2ed2be7
add feature flags to these quota limit assertions
dmarticus Feb 11, 2025
0dd8563
Merge branch 'feat/quota-limiting' of github.com:PostHog/posthog into…
dmarticus Feb 11, 2025
51f0df5
darnit i missed one
dmarticus Feb 11, 2025
b428fa8
two more bugs I fixed wow go me
dmarticus Feb 11, 2025
290d681
Merge branch 'master' into feat/quota-limiting
dmarticus Feb 11, 2025
727c4cf
fixing some tests
dmarticus Feb 11, 2025
3836ffe
Merge branch 'feat/quota-limiting' of github.com:PostHog/posthog into…
dmarticus Feb 11, 2025
62b09c1
sigh idk
dmarticus Feb 11, 2025
b9c38d1
Merge branch 'master' into feat/quota-limiting
dmarticus Feb 12, 2025
fef5963
Merge branch 'master' into feat/quota-limiting
dmarticus Feb 12, 2025
ca1037b
Merge branch 'feat/quota-limiting' of github.com:PostHog/posthog into…
dmarticus Feb 12, 2025
e980e11
cleaning up quota limiting + remote config
dmarticus Feb 12, 2025
cd37a8b
more response polishing
dmarticus Feb 12, 2025
1d662c9
Merge branch 'master' into feat/quota-limiting
dmarticus Feb 12, 2025
7d82a58
delete prints
dmarticus Feb 12, 2025
96f59ae
moar tests
dmarticus Feb 12, 2025
efd76bc
Merge branch 'master' into feat/quota-limiting
dmarticus Feb 12, 2025
9f06adf
limit local eval, too
dmarticus Feb 13, 2025
12c13c8
Merge branch 'feat/quota-limiting' of github.com:PostHog/posthog into…
dmarticus Feb 13, 2025
a176607
renamed the quota limit to make it more clear
dmarticus Feb 14, 2025
f1eab3f
respond with an error instead; this feels more honest
dmarticus Feb 14, 2025
641988e
resolvin them conflicts
dmarticus Feb 14, 2025
1f98d8b
Update query snapshots
github-actions[bot] Feb 14, 2025
14e4312
resolve conflicts
dmarticus Feb 14, 2025
d783b42
Update query snapshots
github-actions[bot] Feb 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ee/billing/quota_limiting.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class QuotaResource(Enum):
EVENTS = "events"
RECORDINGS = "recordings"
ROWS_SYNCED = "rows_synced"
FEATURE_FLAGS = "feature_flags"


class QuotaLimitingCaches(Enum):
Expand All @@ -56,6 +57,7 @@ class QuotaLimitingCaches(Enum):
QuotaResource.EVENTS: 0,
QuotaResource.RECORDINGS: 1000,
QuotaResource.ROWS_SYNCED: 0,
QuotaResource.FEATURE_FLAGS: 0, # TODO: should we have a buffer here?
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm open to calling this feature_flags_evaluated instead, that might be more clear (I like how rows_synced is more obvious than just events.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think FEATURE_FLAGS_EVALUATED makes sense. That way it's not confused with how many feature flags a team has, which I assume is NOT quota limited?

}


Expand Down
17 changes: 17 additions & 0 deletions ee/billing/test/test_quota_limiting.py
Original file line number Diff line number Diff line change
Expand Up @@ -814,3 +814,20 @@ def test_get_team_attribute_by_quota_resource(self, mock_capture):
f"quota_limiting: No team tokens found for organization: {self.organization.id}",
str(mock_capture.call_args[0][0]),
)

def test_feature_flags_quota_limiting(self):
with self.settings(USE_TZ=False), freeze_time("2021-01-25T00:00:00Z"):
self.organization.usage = {
"events": {"usage": 10, "limit": 100},
"recordings": {"usage": 10, "limit": 100},
"feature_flags": {"usage": 110, "limit": 100},
"period": ["2021-01-01T00:00:00Z", "2021-01-31T23:59:59Z"],
}
self.organization.customer_trust_scores = {"events": 0, "recordings": 0, "feature_flags": 0}
self.organization.save()

quota_limited_orgs, quota_limiting_suspended_orgs = update_all_orgs_billing_quotas()

assert self.team.api_token.encode("UTF-8") in self.redis_client.zrange(
f"@posthog/quota-limits/feature_flags", 0, -1
)
dmarticus marked this conversation as resolved.
Show resolved Hide resolved
251 changes: 137 additions & 114 deletions posthog/api/decide.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from random import random
from typing import Any, Union
from typing import Any

import structlog
from django.conf import settings
Expand Down Expand Up @@ -166,11 +166,16 @@ def get_base_config(token: str, team: Team, request: HttpRequest, skip_db: bool
@csrf_exempt
@timed("posthog_cloud_decide_endpoint")
def get_decide(request: HttpRequest):
# handle cors request
"""Handle the /decide endpoint which provides configuration and feature flags to PostHog clients.
The decide endpoint is a critical API that tells PostHog clients (like posthog-js) how they should behave,
including which feature flags are enabled, whether to record sessions, etc.
"""
# --- 1. Handle non-POST requests ---
if request.method == "OPTIONS":
return cors_response(request, JsonResponse({"status": 1}))

if request.method != "POST":
# Return minimal default configuration for non-POST requests
statsd.incr(f"posthog_cloud_raw_endpoint_success", tags={"endpoint": "decide"})
return cors_response(
request,
Expand All @@ -179,51 +184,39 @@ def get_decide(request: HttpRequest):
"config": {"enable_collect_everything": True},
"toolbarParams": {},
"isAuthenticated": False,
# gzip and gzip-js are aliases for the same compression algorithm
"supportedCompression": ["gzip", "gzip-js"],
"featureFlags": [],
"sessionRecording": False,
}
),
)

# --- 2. Parse request data and API version ---
try:
data = load_data_from_request(request)
api_version_string = request.GET.get("v")
# NOTE: This does not support semantic versioning e.g. 2.1.0
api_version = int(api_version_string) if api_version_string else 1
except ValueError:
# default value added because of bug in posthog-js 1.19.0
# see https://sentry.io/organizations/posthog2/issues/2738865125/?project=1899813
# as a tombstone if the below statsd counter hasn't seen errors for N days
# then it is likely that no clients are running posthog-js 1.19.0
# and this defaulting could be removed
# Handle legacy clients (posthog-js 1.19.0) by defaulting to v2
statsd.incr(
f"posthog_cloud_decide_defaulted_api_version_on_value_error",
tags={"endpoint": "decide", "api_version_string": api_version_string},
)
api_version = 2
except UnspecifiedCompressionFallbackParsingError as error:
# Notably don't capture this exception as it's not caused by buggy behavior,
# it's just a fallback for when we can't parse the request due to a missing header
# that we attempted to kludge by manually setting the compression type to gzip
# If this kludge fails, though all we need to do is return a 400 and move on
return cors_response(
request,
generate_exception_response("decide", f"Malformed request data: {error}", code="malformed_data"),
)
except RequestParsingError as error:
capture_exception(error) # We still capture this on Sentry to identify actual potential bugs
except (UnspecifiedCompressionFallbackParsingError, RequestParsingError) as error:
# Return error for malformed requests
return cors_response(
request,
generate_exception_response("decide", f"Malformed request data: {error}", code="malformed_data"),
)

# --- 3. Authenticate the request ---
token = get_token(data, request)
team = Team.objects.get_team_from_cache_or_token(token)

# Handle personal API key authentication if team lookup failed
if team is None and token:
project_id = get_project_id(data, request)

if not project_id:
return cors_response(
request,
Expand All @@ -250,122 +243,42 @@ def get_decide(request: HttpRequest):
)
team = user.teams.get(id=project_id)

is_request_sampled_for_logging = random() < settings.DECIDE_REQUEST_LOGGING_SAMPLING_RATE
# --- 4. Process authenticated requests ---
if team:
# Set up logging and context
is_request_sampled_for_logging = random() < settings.DECIDE_REQUEST_LOGGING_SAMPLING_RATE
if is_request_sampled_for_logging:
logger.warn(
"DECIDE_REQUEST_STARTED",
team_id=team.id,
distinct_id=data.get("distinct_id", None),
)
logger.warn("DECIDE_REQUEST_STARTED", team_id=team.id, distinct_id=data.get("distinct_id"))

# Check if team is allowed to use decide
if team.id in settings.DECIDE_SHORT_CIRCUITED_TEAM_IDS:
return cors_response(
request,
generate_exception_response(
"decide",
f"Team with ID {team.id} cannot access the /decide endpoint."
f"Please contact us at [email protected]",
f"Team with ID {team.id} cannot access the /decide endpoint. Please contact us at [email protected]",
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
),
)

token = team.api_token

structlog.contextvars.bind_contextvars(team_id=team.id)

# --- 5. Handle feature flags ---
flags_response = {}
disable_flags = process_bool(data.get("disable_flags")) is True
feature_flags = None
errors = False
flags_response: dict[str, Any] = {}

if not disable_flags:
distinct_id = data.get("distinct_id")
if distinct_id is None:
return cors_response(
request,
generate_exception_response(
"decide",
"Decide requires a distinct_id.",
code="missing_distinct_id",
type="validation_error",
status_code=status.HTTP_400_BAD_REQUEST,
),
)
else:
distinct_id = str(distinct_id)

property_overrides = {}
geoip_enabled = process_bool(data.get("geoip_disable")) is False

if geoip_enabled:
property_overrides = get_geoip_properties(get_ip_address(request))

all_property_overrides: dict[str, Union[str, int]] = {
**property_overrides,
**(data.get("person_properties") or {}),
}

feature_flags, _, feature_flag_payloads, errors = get_all_feature_flags(
team.pk,
distinct_id,
data.get("groups") or {},
hash_key_override=data.get("$anon_distinct_id"),
property_value_overrides=all_property_overrides,
group_property_value_overrides=(data.get("group_properties") or {}),
flags_response = compute_feature_flags(
request, data, team, token, api_version, is_request_sampled_for_logging
)

active_flags = {key: value for key, value in feature_flags.items() if value}

if api_version == 2:
flags_response["featureFlags"] = active_flags
elif api_version >= 3:
# v3 returns all flags, not just active ones, as well as if there was an error computing all flags
flags_response["featureFlags"] = feature_flags
flags_response["errorsWhileComputingFlags"] = errors
flags_response["featureFlagPayloads"] = feature_flag_payloads
else:
# default v1
flags_response["featureFlags"] = list(active_flags.keys())

# metrics for feature flags
team_id_label = label_for_team_id_to_track(team.pk)
FLAG_EVALUATION_COUNTER.labels(
team_id=team_id_label,
errors_computing=errors,
has_hash_key_override=bool(data.get("$anon_distinct_id")),
).inc()

if is_request_sampled_for_logging:
logger.warn(
"DECIDE_REQUEST_SUCCEEDED",
team_id=team.id,
distinct_id=distinct_id,
errors_while_computing=errors or False,
has_hash_key_override=bool(data.get("$anon_distinct_id")),
)
else:
flags_response["featureFlags"] = {}
Comment on lines -318 to -348
Copy link
Contributor Author

@dmarticus dmarticus Feb 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this code was all moved to get_feature_flags_response

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wish GitHub supported semantic diffs to make it easier to review changes like this. 😆

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, that'd be so nice. Sometimes I find the side-by-side view to help with it.


# NOTE: Changed code - everything not feature flags goes in here
response = get_base_config(token, team, request, skip_db=errors)
# --- 6. Build and return full response from the base config and the flags response ---
response = get_base_config(token, team, request, skip_db=flags_response.get("errorsWhileComputingFlags", False))
response.update(flags_response)

# NOTE: Whenever you add something to decide response, update this test:
# `test_decide_doesnt_error_out_when_database_is_down`
# which ensures that decide doesn't error out when the database is down

if feature_flags:
# Billing analytics for decide requests with feature flags
# Don't count if all requests are for survey targeting flags only.
if not all(flag.startswith(SURVEY_TARGETING_FLAG_PREFIX) for flag in feature_flags.keys()):
# Sample no. of decide requests with feature flags
if settings.DECIDE_BILLING_SAMPLING_RATE and random() < settings.DECIDE_BILLING_SAMPLING_RATE:
count = int(1 / settings.DECIDE_BILLING_SAMPLING_RATE)
increment_request_count(team.pk, count)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this code was also moved to compute_feature_flags, since it's only relevant if we actually evaluated flags.

Comment on lines -358 to -365
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this, and other logging, is moved into _record_feature_flag_metrics


else:
# no auth provided
# No valid authentication provided
return cors_response(
request,
generate_exception_response(
Expand All @@ -381,6 +294,116 @@ def get_decide(request: HttpRequest):
return cors_response(request, JsonResponse(response))


def compute_feature_flags(request, data, team, token, api_version, is_request_sampled_for_logging):
"""Compute feature flags for the given request. Extracted from get_decide for clarity."""
flags_response: dict[str, Any] = {}

# --- 1. Validate distinct_id ---
distinct_id = data.get("distinct_id")
if distinct_id is None:
return cors_response(
request,
generate_exception_response(
"decide",
"Decide requires a distinct_id.",
code="missing_distinct_id",
type="validation_error",
status_code=status.HTTP_400_BAD_REQUEST,
),
)
distinct_id = str(distinct_id)

# --- 2. Check feature flag quota limits ---
if settings.DECIDE_FEATURE_FLAG_QUOTA_CHECK:
from ee.billing.quota_limiting import (
QuotaLimitingCaches,
QuotaResource,
list_limited_team_attributes,
)

limited_tokens_flags = list_limited_team_attributes(
QuotaResource.FEATURE_FLAGS, QuotaLimitingCaches.QUOTA_LIMITER_CACHE_KEY
)

if token in limited_tokens_flags:
flags_response.update(
{
"quotaLimited": ["feature_flags"],
"featureFlags": {},
"errorsWhileComputingFlags": True,
"featureFlagPayloads": {},
}
)
return flags_response

# --- 3. Gather property overrides ---
property_overrides = {}
if process_bool(data.get("geoip_disable")) is False:
property_overrides = get_geoip_properties(get_ip_address(request))

all_property_overrides = {
**property_overrides,
**(data.get("person_properties") or {}),
}

# --- 4. Compute feature flags ---
feature_flags, _, feature_flag_payloads, errors = get_all_feature_flags(
team.pk,
distinct_id,
data.get("groups") or {},
hash_key_override=data.get("$anon_distinct_id"),
property_value_overrides=all_property_overrides,
group_property_value_overrides=(data.get("group_properties") or {}),
)

# --- 5. Format response based on API version ---
active_flags = {key: value for key, value in feature_flags.items() if value}

if api_version == 2:
# v2: Return only active flags as a dict
flags_response["featureFlags"] = active_flags
elif api_version >= 3:
# v3: Return all flags, error status, and payloads
flags_response.update(
{
"featureFlags": feature_flags,
"errorsWhileComputingFlags": errors,
"featureFlagPayloads": feature_flag_payloads,
}
)
else:
# v1: Return active flag keys as a list
flags_response["featureFlags"] = list(active_flags.keys())

# --- 6. Record metrics and logs ---
# Track feature flag evaluation metrics
team_id_label = label_for_team_id_to_track(team.pk)
FLAG_EVALUATION_COUNTER.labels(
team_id=team_id_label,
errors_computing=errors,
has_hash_key_override=bool(data.get("$anon_distinct_id")),
).inc()

if is_request_sampled_for_logging:
logger.warn(
"DECIDE_REQUEST_SUCCEEDED",
team_id=team.id,
distinct_id=distinct_id,
errors_while_computing=errors or False,
has_hash_key_override=bool(data.get("$anon_distinct_id")),
)

# --- 7. Handle billing analytics ---
if feature_flags:
# Only count non-survey targeting flags for billing
if not all(flag.startswith(SURVEY_TARGETING_FLAG_PREFIX) for flag in feature_flags.keys()):
if settings.DECIDE_BILLING_SAMPLING_RATE and random() < settings.DECIDE_BILLING_SAMPLING_RATE:
count = int(1 / settings.DECIDE_BILLING_SAMPLING_RATE)
increment_request_count(team.pk, count)

return flags_response


def _session_recording_domain_not_allowed(team: Team, request: HttpRequest) -> bool:
return team.recording_domains and not on_permitted_recording_domain(team.recording_domains, request)

Expand Down
Loading
Loading