Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions ax/analysis/healthcheck/no_effects_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,11 @@ def compute(
df_tone = check_experiment_effects_per_metric(
data=data, objective_names=set(objective_names)
)
if df_tone.empty:
raise UserInputError(
"TestOfNoEffectAnalysis requires at least one trial with two "
"or more arms, since the test compares arms within a trial."
)
metrics_tone = df_tone.groupby("metric_name")["has_effect"].sum() > 0
metrics_with_effects = [i for i in metrics_tone.index if metrics_tone[i]]
objectives_without_effects = set()
Expand Down
68 changes: 68 additions & 0 deletions ax/analysis/healthcheck/tests/test_no_effects_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,74 @@ def test_raises_error_without_data(self) -> None:
with self.assertRaises(AxError):
self.tone.compute(experiment=self.experiment)

def test_raises_error_without_sample_sizes(self) -> None:
# GIVEN an experiment whose data has no `n` column (as produced by
# standard trial evaluation, e.g. `Client.complete_trial`)
self.experiment.attach_data(
data=Data(
df=pd.DataFrame(
{
"arm_name": ["0_0", "0_1", "0_2"],
"metric_name": ["branin"] * 3,
"mean": [0.0, 1.0, 2.0],
"sem": [0.1] * 3,
"trial_index": [0] * 3,
"metric_signature": ["branin"] * 3,
}
)
)
)
# WHEN we compute the healthcheck
# THEN it raises a UserInputError instead of a KeyError
with self.assertRaisesRegex(UserInputError, "per-arm sample sizes"):
self.tone.compute(experiment=self.experiment)

def test_deterministic_data_detects_effects(self) -> None:
# GIVEN an experiment with deterministic data (sem == 0) and
# clearly different means
self.experiment.attach_data(
data=Data(
df=pd.DataFrame(
{
"arm_name": ["0_0", "0_1", "0_2"],
"metric_name": ["branin"] * 3,
"mean": [0.0, 1.0, 2.0],
"sem": [0.0] * 3,
"trial_index": [0] * 3,
"n": [1000] * 3,
"metric_signature": ["branin"] * 3,
}
)
)
)
# WHEN we compute the healthcheck
card = self.tone.compute(experiment=self.experiment)
# THEN it is a PASS (previously a NaN p-value silently read as
# "no effect" and produced a WARNING)
self.assertEqual(card.get_status(), HealthcheckStatus.PASS)

def test_raises_error_with_only_single_arm_trials(self) -> None:
# GIVEN an experiment whose trials all have a single arm
self.experiment.attach_data(
data=Data(
df=pd.DataFrame(
{
"arm_name": ["0_0"],
"metric_name": ["branin"],
"mean": [1.0],
"sem": [0.1],
"trial_index": [0],
"n": [1000],
"metric_signature": ["branin"],
}
)
)
)
# WHEN we compute the healthcheck
# THEN it raises a UserInputError
with self.assertRaisesRegex(UserInputError, "two or more arms"):
self.tone.compute(experiment=self.experiment)

def test_multi_objective_partial_no_effects(self) -> None:
# GIVEN we have a multi-objective experiment with one metric with no effects
self.moo_experiment.attach_data(
Expand Down
58 changes: 57 additions & 1 deletion ax/utils/stats/no_effects.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,26 @@
import pandas as pd
import scipy
from ax.core.data import Data
from ax.exceptions.core import UserInputError
from ax.utils.stats.math_utils import relativize
from scipy.stats import norm


def _validate_sample_sizes(df: pd.DataFrame) -> None:
"""Ensure per-arm sample sizes are available for the test of no effect.

The ``n`` column is optional on Ax ``Data`` and is not produced by
standard trial evaluation (e.g. ``Client.complete_trial``), so it must
be validated before use.
"""
if "n" not in df.columns or df["n"].isnull().any():
raise UserInputError(
"The test of no effect requires per-arm sample sizes: every row "
"of the data must have a value in the `n` column. Attach data "
"that includes sample sizes to use this test."
)


def check_experiment_effects_per_metric(
data: Data,
objective_names: set[str] | None = None,
Expand Down Expand Up @@ -44,6 +60,7 @@ def check_experiment_effects_per_metric(

"""
df = data.df
_validate_sample_sizes(df)
df_grouped = df.groupby(["metric_name", "trial_index"])
cols = ["metric_name", "trial_index", "p_value", "has_effect"]

Expand All @@ -56,6 +73,10 @@ def check_experiment_effects_per_metric(
for metric_name, trial_index in df_grouped.groups.keys():
dfm = df_grouped.get_group((metric_name, trial_index))

if len(dfm) < 2:
# A single-arm group cannot show within-trial effects.
continue

p_value, f_stat = no_effect_test_welch(
means=list(dfm["mean"].values),
sems=list(dfm["sem"].values),
Expand Down Expand Up @@ -112,11 +133,12 @@ def check_experiment_effects(
Returns:
effective: True if the null of no treatment effects can be rejected.
ineffective_on_objectives: List of objectives on which the
null of no treatment effects can be rejected.
null of no treatment effects could not be rejected.
bounds_df: The minimum and maximum bounds on possible effects.

"""
df = data.df
_validate_sample_sizes(df)
df_grouped = df.groupby("metric_name")
K = len(df_grouped)
ps = []
Expand Down Expand Up @@ -228,14 +250,48 @@ def no_effect_test_welch(
Returns: A tuple containing
- the p-value and
- the test-statistic value.

Raises:
UserInputError: If there are fewer than two arms, an arm has one or
fewer observations, or a mix of zero and positive sems makes the
test undefined.
"""
means_arr = np.array(means)
sems_arr = np.array(sems)
ns_arr = np.array(ns)

K = len(means_arr)

if K < 2:
raise UserInputError(
f"Welch's test of no effect requires at least two arms, received {K}."
)
if np.any(ns_arr <= 1):
raise UserInputError(
"Welch's test of no effect requires more than one observation "
"(n > 1) in each arm."
)

variances = np.multiply(sems_arr**2, ns_arr)

zero_variance = variances <= 0
if np.any(zero_variance):
# The mean of an arm with sem == 0 is known exactly, so Welch's F
# statistic is undefined (its weights divide by the variance).
# Previously this silently produced a NaN p-value, which read as
# "no effect" even when the exactly-known means clearly differed.
if np.unique(means_arr[zero_variance]).size > 1:
# Two exactly-known means differ: an exact effect.
return 0.0, float("inf")
if np.all(zero_variance):
# All means are known exactly and are identical: no effect.
return 1.0, 0.0
raise UserInputError(
"Welch's test of no effect requires a positive sem for each "
"arm; found a mix of zero and positive sems, for which the "
"test is undefined."
)

ws = np.divide(ns_arr, variances)

W = np.sum(ws)
Expand Down
148 changes: 148 additions & 0 deletions ax/utils/stats/tests/test_no_effects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

import pandas as pd
from ax.core.data import Data
from ax.exceptions.core import UserInputError
from ax.utils.common.testutils import TestCase
from ax.utils.stats.no_effects import (
check_experiment_effects,
check_experiment_effects_per_metric,
no_effect_test_welch,
)


def _data(rows: list[dict[str, object]]) -> Data:
base = {"trial_index": 0, "metric_name": "m1", "metric_signature": "m1"}
return Data(df=pd.DataFrame([{**base, **row} for row in rows]))


class TestNoEffects(TestCase):
def test_effects_detected(self) -> None:
# GIVEN two arms with clearly different means
data = _data(
[
{"arm_name": "0_0", "mean": 1.0, "sem": 0.01, "n": 1000},
{"arm_name": "0_1", "mean": 2.0, "sem": 0.01, "n": 1000},
]
)
# WHEN we run the test of no effect
df_tone = check_experiment_effects_per_metric(data=data, objective_names={"m1"})
# THEN an effect is detected
self.assertTrue(df_tone["has_effect"].all())

def test_missing_sample_sizes(self) -> None:
# GIVEN data without the optional `n` column (as produced by standard
# trial evaluation, e.g. `Client.complete_trial`)
data = _data(
[
{"arm_name": "0_0", "mean": 1.0, "sem": 0.1},
{"arm_name": "0_1", "mean": 2.0, "sem": 0.1},
]
)
# WHEN we run the test of no effect
# THEN it raises a UserInputError instead of a KeyError
with self.assertRaisesRegex(UserInputError, "per-arm sample sizes"):
check_experiment_effects_per_metric(data=data, objective_names={"m1"})

def test_check_experiment_effects_missing_sample_sizes(self) -> None:
# GIVEN data without the optional `n` column
data = _data(
[
{"arm_name": "status_quo", "mean": 1.0, "sem": 0.1},
{"arm_name": "0_0", "mean": 2.0, "sem": 0.1},
]
)
# WHEN we run the overall (across-metric) test of no effect
# THEN it raises a UserInputError instead of a KeyError
with self.assertRaisesRegex(UserInputError, "per-arm sample sizes"):
check_experiment_effects(data=data, objective_names={"m1"})

def test_null_sample_sizes(self) -> None:
# GIVEN data where some rows are missing a sample size
data = _data(
[
{"arm_name": "0_0", "mean": 1.0, "sem": 0.1, "n": 1000},
{"arm_name": "0_1", "mean": 2.0, "sem": 0.1, "n": None},
]
)
# WHEN we run the test of no effect
# THEN it raises a UserInputError
with self.assertRaisesRegex(UserInputError, "per-arm sample sizes"):
check_experiment_effects_per_metric(data=data, objective_names={"m1"})

def test_zero_sem_with_different_means(self) -> None:
# GIVEN deterministic data (sem == 0) with clearly different means
data = _data(
[
{"arm_name": "0_0", "mean": 1.0, "sem": 0.0, "n": 1000},
{"arm_name": "0_1", "mean": 2.0, "sem": 0.0, "n": 1000},
]
)
# WHEN we run the test of no effect
df_tone = check_experiment_effects_per_metric(data=data, objective_names={"m1"})
# THEN the exact effect is detected (previously a NaN p-value silently
# read as "no effect")
self.assertEqual(df_tone["p_value"].item(), 0.0)
self.assertTrue(df_tone["has_effect"].item())

def test_zero_sem_with_equal_means(self) -> None:
# GIVEN deterministic data (sem == 0) with identical means
data = _data(
[
{"arm_name": "0_0", "mean": 1.0, "sem": 0.0, "n": 1000},
{"arm_name": "0_1", "mean": 1.0, "sem": 0.0, "n": 1000},
]
)
# WHEN we run the test of no effect
df_tone = check_experiment_effects_per_metric(data=data, objective_names={"m1"})
# THEN no effect is detected, with a well-defined p-value
self.assertEqual(df_tone["p_value"].item(), 1.0)
self.assertFalse(df_tone["has_effect"].item())

def test_mixed_zero_and_positive_sems(self) -> None:
# GIVEN one arm with sem == 0 and another with a positive sem
# WHEN we run Welch's test directly
# THEN it raises a UserInputError since the test is undefined
with self.assertRaisesRegex(UserInputError, "positive sem"):
no_effect_test_welch(means=[1.0, 1.0], sems=[0.0, 0.1], ns=[1000, 1000])

def test_single_arm_groups_are_skipped(self) -> None:
# GIVEN one single-arm trial and one two-arm trial
data = _data(
[
{"arm_name": "0_0", "mean": 1.0, "sem": 0.1, "n": 1000},
{
"arm_name": "1_0",
"mean": 1.0,
"sem": 0.1,
"n": 1000,
"trial_index": 1,
},
{
"arm_name": "1_1",
"mean": 2.0,
"sem": 0.1,
"n": 1000,
"trial_index": 1,
},
]
)
# WHEN we run the test of no effect
df_tone = check_experiment_effects_per_metric(data=data, objective_names={"m1"})
# THEN the single-arm trial is skipped (previously it produced a NaN
# p-value) and the two-arm trial is tested
self.assertEqual(df_tone["trial_index"].tolist(), [1])
self.assertTrue(df_tone["has_effect"].item())

def test_welch_requires_at_least_two_arms(self) -> None:
with self.assertRaisesRegex(UserInputError, "at least two arms"):
no_effect_test_welch(means=[1.0], sems=[0.1], ns=[1000])

def test_welch_requires_more_than_one_observation(self) -> None:
with self.assertRaisesRegex(UserInputError, "more than one observation"):
no_effect_test_welch(means=[1.0, 2.0], sems=[0.1, 0.1], ns=[1, 1000])