fix(experiments): Calculate win probability against control (#27804)
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
danielbachhuber and github-actions[bot] authored Jan 29, 2025
1 parent 3397f68 commit d28991b
Showing 8 changed files with 181 additions and 37 deletions.
14 changes: 12 additions & 2 deletions frontend/src/scenes/experiments/experimentLogic.tsx
@@ -1311,8 +1311,15 @@ export const experimentLogic = kea<experimentLogicType>([
},
],
significanceDetails: [
(s) => [s.metricResults],
(metricResults: (CachedExperimentFunnelsQueryResponse | CachedExperimentTrendsQueryResponse | null)[]) =>
(s) => [s.metricResults, s.experimentStatsVersion],
(
metricResults: (
| CachedExperimentFunnelsQueryResponse
| CachedExperimentTrendsQueryResponse
| null
)[],
experimentStatsVersion: number
) =>
(metricIndex: number = 0): string => {
const results = metricResults?.[metricIndex]

@@ -1329,6 +1336,9 @@ export const experimentLogic = kea<experimentLogicType>([
}

if (results?.significance_code === ExperimentSignificanceCode.LowWinProbability) {
if (experimentStatsVersion === 2) {
return 'This is because no variant (control or test) has a win probability higher than 90%.'
}
return 'This is because the win probability of all test variants combined is less than 90%.'
}

22 changes: 16 additions & 6 deletions posthog/hogql_queries/experiments/funnels_statistics_v2.py
@@ -37,9 +37,9 @@ def calculate_probabilities_v2(
Returns:
--------
list[float]
A list of probabilities that sum to 1, where:
- The first element is the probability that the control variant is the best
- Subsequent elements are the probabilities that each test variant is the best
A list of probabilities where each element represents:
- index 0: probability control beats the best test variant
- index i>0: probability test variant i-1 beats control
Notes:
------
@@ -70,10 +70,20 @@ def calculate_probabilities_v2(
samples.append(variant_samples)

samples_array = np.array(samples)
# Calculate probability of each variant being the best
probabilities = []
for i in range(len(all_variants)):
probability = (samples_array[i] == np.max(samples_array, axis=0)).mean()
control_samples = samples_array[0] # Control is always first variant

# Find the best test variant at each sample point
test_variants_samples = samples_array[1:]
best_variant_samples = np.max(test_variants_samples, axis=0)

# Control's win probability: the chance it beats the best test variant
control_prob = np.mean(control_samples >= best_variant_samples)
probabilities.append(float(control_prob))

# For each test variant, calculate probability of beating control
for i in range(1, len(all_variants)):
probability = np.mean(samples_array[i] > control_samples)
probabilities.append(float(probability))

return probabilities
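To see the new behaviour in isolation, here is a minimal self-contained sketch of the funnel-metric calculation after this change. The Beta(1 + successes, 1 + failures) posterior, the 10,000-draw Monte Carlo, and the helper name are illustrative assumptions rather than the module's actual constants.

```python
# Illustrative sketch only: priors, draw count, and the helper name are assumptions,
# not the module's actual constants.
import numpy as np


def win_probabilities_vs_control(counts: list[tuple[int, int]], draws: int = 10_000) -> list[float]:
    """counts[0] is (successes, failures) for control; the rest are test variants."""
    rng = np.random.default_rng(42)
    # One row of posterior conversion-rate draws per variant.
    samples = np.array([rng.beta(1 + s, 1 + f, size=draws) for s, f in counts])

    control = samples[0]
    best_test = np.max(samples[1:], axis=0)

    # index 0: control beats the best test variant at each draw
    probabilities = [float(np.mean(control >= best_test))]
    # index i > 0: test variant i-1 beats control (ignoring the other test variants)
    probabilities += [float(np.mean(variant > control)) for variant in samples[1:]]
    return probabilities


# With the conversion counts used in the new funnel test below (control 100/900,
# test_a 80/920, test_b 150/850, test_c 110/890), this lands near the asserted
# v2 values of [0, 0.05, 0.99, 0.76].
print(win_probabilities_vs_control([(100, 900), (80, 920), (150, 850), (110, 890)]))
```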
29 changes: 29 additions & 0 deletions posthog/hogql_queries/experiments/test/test_funnels_statistics.py
@@ -173,6 +173,35 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

@flaky(max_runs=5, min_passes=1)
def test_many_variants_win_probability_compared_to_control(self):
"""Test with multiple variants, win probability compared to control"""

def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
# test_a is worse than control
# test_b is best overall
# test_c is slightly better than control
control = create_variant("control", success_count=100, failure_count=900) # 10% conversion
test_a = create_variant("test_a", success_count=80, failure_count=920) # 8% conversion
test_b = create_variant("test_b", success_count=150, failure_count=850) # 15% conversion
test_c = create_variant("test_c", success_count=110, failure_count=890) # 11% conversion

probabilities = calculate_probabilities(control, [test_a, test_b, test_c])

self.assertEqual(len(probabilities), 4)
if stats_version == 2:
self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
self.assertAlmostEqual(probabilities[1], 0.05, delta=0.05)
self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
self.assertAlmostEqual(probabilities[3], 0.76, delta=0.05)
else:
self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
self.assertAlmostEqual(probabilities[1], 0, delta=0.05)
self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
self.assertAlmostEqual(probabilities[3], 0.0, delta=0.05)

self.run_test_for_both_implementations(run_test)
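As a rough cross-check of the stats_version == 2 expectations asserted above (not how the implementation computes them), a normal approximation to the posterior conversion rates reproduces the test_c figure:

```python
# Back-of-the-envelope check, assuming a normal approximation to the posteriors.
from math import sqrt

from scipy.stats import norm

p_ctrl, p_c, n = 0.10, 0.11, 1000  # control and test_c conversion rates from the test above
se = sqrt(p_ctrl * (1 - p_ctrl) / n + p_c * (1 - p_c) / n)  # ~0.0137
print(norm.cdf((p_c - p_ctrl) / se))  # ~0.77, close to the 0.76 asserted for stats_version == 2
```

Under stats_version 1 the same variant scores near zero because it is almost never the single best variant (test_b dominates), which is exactly the framing this commit moves away from.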

@flaky(max_runs=5, min_passes=1)
def test_insufficient_sample_size(self):
"""Test with sample size below threshold"""
(Diff for another changed file follows; its file header was not captured.)
@@ -343,9 +343,11 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.assertEqual(len(probabilities), 4)
if stats_version == 2:
self.assertTrue(probabilities[2] > 0.9)
self.assertTrue(probabilities[1] < 0.1)
self.assertTrue(probabilities[0] < 0.1)
self.assertTrue(probabilities[1] > 0.9)
self.assertTrue(probabilities[2] > 0.9)
self.assertTrue(probabilities[3] > 0.9)

self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
self.assertEqual(p_value, 0)

@@ -389,6 +391,55 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

@flaky(max_runs=5, min_passes=1)
def test_many_variants_win_probability_compared_to_control(self):
"""Test with multiple variants, win probability compared to control"""

def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
control_absolute_exposure = 1000
control = create_variant(
"control",
total_sum=100.0 * control_absolute_exposure,
exposure=1,
absolute_exposure=control_absolute_exposure,
)
test_a_absolute_exposure = 1000
test_a = create_variant(
"test_a",
total_sum=85.0 * test_a_absolute_exposure,
exposure=test_a_absolute_exposure / control_absolute_exposure,
absolute_exposure=test_a_absolute_exposure,
)
test_b_absolute_exposure = 1000
test_b = create_variant(
"test_b",
total_sum=150.0 * test_b_absolute_exposure,
exposure=test_b_absolute_exposure / control_absolute_exposure,
absolute_exposure=test_b_absolute_exposure,
)
test_c_absolute_exposure = 1000
test_c = create_variant(
"test_c",
total_sum=110.0 * test_c_absolute_exposure,
exposure=test_c_absolute_exposure / control_absolute_exposure,
absolute_exposure=test_c_absolute_exposure,
)
probabilities = calculate_probabilities(control, [test_a, test_b, test_c])

self.assertEqual(len(probabilities), 4)
if stats_version == 2:
self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
self.assertAlmostEqual(probabilities[1], 0.05, delta=0.05)
self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
self.assertAlmostEqual(probabilities[3], 0.99, delta=0.05)
else:
self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
self.assertAlmostEqual(probabilities[1], 0, delta=0.05)
self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
self.assertAlmostEqual(probabilities[3], 0.0, delta=0.05)

self.run_test_for_both_implementations(run_test)

@flaky(max_runs=5, min_passes=1)
def test_insufficient_sample_size(self):
"""Test with sample size below threshold"""
(Diff for another changed file follows; its file header was not captured.)
@@ -247,13 +247,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
intervals = calculate_credible_intervals([control, test_a, test_b, test_c])

self.assertEqual(len(probabilities), 4)
self.assertTrue(probabilities[2] > 0.9) # test_b should be winning
self.assertTrue(probabilities[1] < 0.1) # test_a should be losing
self.assertTrue(probabilities[0] < 0.1) # control should be losing
self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
if stats_version == 2:
self.assertTrue(probabilities[0] < 0.1) # control is losing
self.assertTrue(probabilities[1] > 0.7) # test_a beats control, but less confidently
self.assertTrue(probabilities[2] > 0.9) # test_b beats control
self.assertTrue(probabilities[3] > 0.9) # test_c beats control
self.assertEqual(p_value, 0)
else:
self.assertTrue(probabilities[0] < 0.1) # control should be losing
self.assertTrue(probabilities[1] < 0.1) # test_a should be losing
self.assertTrue(probabilities[2] > 0.9) # test_b should be winning
self.assertTrue(probabilities[3] < 0.1) # test_c should be losing
self.assertLess(p_value, 0.001)

# Control at 10%
@@ -274,6 +279,51 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

@flaky(max_runs=5, min_passes=1)
def test_many_variants_win_probability_compared_to_control(self):
"""Test with multiple variants, win probability compared to control"""

def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
control_absolute_exposure = 1000
control = create_variant("control", count=100, exposure=1, absolute_exposure=control_absolute_exposure)
test_a_absolute_exposure = 1000
test_a = create_variant(
"test_a",
count=85,
exposure=test_a_absolute_exposure / control_absolute_exposure,
absolute_exposure=test_a_absolute_exposure,
)
test_b_absolute_exposure = 1000
test_b = create_variant(
"test_b",
count=150,
exposure=test_b_absolute_exposure / control_absolute_exposure,
absolute_exposure=test_b_absolute_exposure,
)
test_c_absolute_exposure = 1000
test_c = create_variant(
"test_c",
count=110,
exposure=test_c_absolute_exposure / control_absolute_exposure,
absolute_exposure=test_c_absolute_exposure,
)

probabilities = calculate_probabilities(control, [test_a, test_b, test_c])

self.assertEqual(len(probabilities), 4)
if stats_version == 2:
self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
self.assertAlmostEqual(probabilities[1], 0.13, delta=0.05)
self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
self.assertAlmostEqual(probabilities[3], 0.75, delta=0.05)
else:
self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
self.assertAlmostEqual(probabilities[1], 0, delta=0.05)
self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
self.assertAlmostEqual(probabilities[3], 0.0, delta=0.05)

self.run_test_for_both_implementations(run_test)

@flaky(max_runs=5, min_passes=1)
def test_real_world_data_1(self):
"""Test with multiple variants, one clear winner"""
(Diff for another changed file follows; its file header was not captured.)
@@ -44,11 +44,9 @@ def calculate_probabilities_v2_continuous(
Returns:
--------
list[float]
A list of probabilities where each element represents the probability that the
corresponding variant is the best (has highest mean value) among all variants:
- index 0: probability control variant is best
- index i>0: probability test variant i-1 is best
All probabilities sum to 1.0
A list of probabilities where each element represents:
- index 0: probability control variant beats the best test variant
- index i>0: probability test variant i-1 beats control
Notes:
------
@@ -118,16 +116,14 @@ def calculate_probabilities_v2_continuous(
# Calculate probabilities
probabilities = []

# Probability control wins (beats all test variants)
control_wins = np.all([samples_control > test_sample for test_sample in test_samples], axis=0)
# Probability control wins (beats the best test variant)
best_test_samples = np.max(test_samples, axis=0) # Get best test variant at each sample point
control_wins = samples_control > best_test_samples
probabilities.append(float(np.mean(control_wins)))

# Probability each test variant wins (beats control and all other test variants)
for i, test_sample in enumerate(test_samples):
other_test_samples = test_samples[:i] + test_samples[i + 1 :]
variant_wins = np.all(
[test_sample > samples_control] + [test_sample > other for other in other_test_samples], axis=0
)
# Probability each test variant wins (beats control only)
for test_sample in test_samples:
variant_wins = test_sample > samples_control
probabilities.append(float(np.mean(variant_wins)))

return probabilities
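A consequence of the docstring change above that is easy to miss: the returned probabilities no longer sum to 1, because each entry answers its own pairwise question. A toy illustration with made-up posterior draws:

```python
# Toy posterior draws, one row per variant; values are made up for illustration.
import numpy as np

samples = np.array([
    [1.0, 1.1, 0.9, 1.0],  # control
    [1.2, 1.0, 1.1, 1.3],  # test_a
    [1.1, 1.2, 1.0, 1.2],  # test_b
])

best_test = np.max(samples[1:], axis=0)
probs = [float(np.mean(samples[0] > best_test))]                    # control vs. best test variant
probs += [float(np.mean(row > samples[0])) for row in samples[1:]]  # each test variant vs. control
print(probs)  # [0.0, 0.75, 1.0] -- the entries can total more than 1, unlike the v1 "best variant" split
```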
20 changes: 9 additions & 11 deletions posthog/hogql_queries/experiments/trends_statistics_v2_count.py
@@ -36,9 +36,9 @@ def calculate_probabilities_v2_count(
Returns:
--------
list[float]
A list of probabilities that sum to 1, where:
- The first element is the probability that the control variant is the best
- Subsequent elements are the probabilities that each test variant is the best
A list of probabilities where each element represents:
- index 0: probability control variant beats the best test variant
- index i>0: probability test variant i-1 beats control
Notes:
------
@@ -78,16 +78,14 @@ def calculate_probabilities_v2_count(
# Calculate probabilities
probabilities = []

# Probability control wins (beats all test variants)
control_wins = np.all([samples_control > test_sample for test_sample in test_samples], axis=0)
# Probability control wins (beats the best test variant)
best_test_samples = np.max(test_samples, axis=0)
control_wins = samples_control > best_test_samples
probabilities.append(float(np.mean(control_wins)))

# Probability each test variant wins (beats control and all other test variants)
for i, test_sample in enumerate(test_samples):
other_test_samples = test_samples[:i] + test_samples[i + 1 :]
variant_wins = np.all(
[test_sample > samples_control] + [test_sample > other for other in other_test_samples], axis=0
)
# Probability each test variant wins (beats control only)
for test_sample in test_samples:
variant_wins = test_sample > samples_control
probabilities.append(float(np.mean(variant_wins)))

return probabilities
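For completeness, the count-metric version can be sketched the same way. The posterior family is outside this hunk, so the Gamma(1 + count, scale 1/exposure) posterior below is an assumption for illustration, as are the helper name and draw count.

```python
# Illustrative sketch only: the Gamma posterior, draw count, and helper name are assumptions.
import numpy as np


def count_win_probabilities_vs_control(variants: list[tuple[int, int]], draws: int = 10_000) -> list[float]:
    """variants[0] is (event_count, absolute_exposure) for control; the rest are test variants."""
    rng = np.random.default_rng(0)
    # Posterior draws of the per-exposure event rate for each variant.
    samples = np.array([rng.gamma(1 + count, 1.0 / exposure, size=draws) for count, exposure in variants])

    control = samples[0]
    best_test = np.max(samples[1:], axis=0)
    probs = [float(np.mean(control > best_test))]
    probs += [float(np.mean(row > control)) for row in samples[1:]]
    return probs


# With the event counts from the new trends count test above (100, 85, 150, 110 events
# over 1000 exposures each), this lands near the asserted v2 values of [0, 0.13, 0.99, 0.75].
print(count_win_probabilities_vs_control([(100, 1000), (85, 1000), (150, 1000), (110, 1000)]))
```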
