Source code for causalpy.checks.outcome_falsification

#   Copyright 2022 - 2026 The PyMC Labs Developers
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
"""
Outcome falsification sensitivity check.

Re-fits the same causal method with alternative outcome formulas and
reports their estimated effect sizes.  The researcher interprets
whether the pattern of effects across outcomes is consistent with
their causal story.

Inspired by the "causal detective" approach in Gallea (2026),
*The Causal Mindset Handbook*, where Hanlon's London fog study
tests the pollution hypothesis by comparing effect sizes across
outcomes (pneumonia deaths vs. accident deaths vs. crime deaths).

Supports formula-based experiments: InterruptedTimeSeries,
DifferenceInDifferences, PiecewiseITS.
"""

from __future__ import annotations

import logging
import warnings
from dataclasses import dataclass
from typing import Any

import pandas as pd
from patsy import PatsyError

from causalpy.checks.base import CheckResult, clone_model
from causalpy.custom_exceptions import DataException, FormulaException
from causalpy.experiments.base import BaseExperiment
from causalpy.experiments.diff_in_diff import DifferenceInDifferences
from causalpy.experiments.interrupted_time_series import InterruptedTimeSeries
from causalpy.experiments.piecewise_its import PiecewiseITS
from causalpy.pipeline import PipelineContext
from causalpy.pymc_models import PyMCModel

logger = logging.getLogger(__name__)

_STORE_EXPERIMENTS_WARN_THRESHOLD = 3


[docs] @dataclass class FalsificationResult: """Result for a single falsification formula. Attributes ---------- formula : str The falsification formula used. effect_mean : float Posterior mean of the estimated effect. hdi_lower : float Lower bound of the HDI for the effect. hdi_upper : float Upper bound of the HDI for the effect. experiment : BaseExperiment | None The fitted experiment for this formula. ``None`` when ``OutcomeFalsification`` was run with ``store_experiments=False``; in that case only the summary statistics above are retained. """ formula: str effect_mean: float hdi_lower: float hdi_upper: float experiment: BaseExperiment | None = None
[docs] class OutcomeFalsification: """Outcome falsification sensitivity check. Re-fits the experiment with alternative formulas and reports the estimated effect size for each. This is an informational check -- it does not make pass/fail judgments. The researcher compares effect sizes across outcomes to assess whether the pattern is consistent with their causal story. Parameters ---------- formulas : list[str] Falsification formulas. Each must be a complete patsy formula (e.g., ``"beer ~ 1 + year"``). The outcome variable must exist in the data. alpha : float, default 0.05 Significance level. The HDI probability is ``1 - alpha``. store_experiments : bool, default True If ``True`` (default), each ``FalsificationResult`` retains a reference to the fitted experiment (including its ``InferenceData``), which lets users inspect posteriors but can be memory-heavy for many formulas. Set to ``False`` to keep only the summary statistics (``effect_mean``, ``hdi_lower``, ``hdi_upper``). A one-off warning is emitted at :meth:`run` when ``store_experiments=True`` and at least ``3`` formulas are supplied, because the combined ``InferenceData`` footprint of several fitted experiments can easily reach hundreds of MB on larger datasets (e.g. :class:`PiecewiseITS`). Examples -------- >>> import causalpy as cp # doctest: +SKIP >>> check = cp.checks.OutcomeFalsification( # doctest: +SKIP ... formulas=["beer ~ 1 + year", "lnincome ~ 1 + year"], ... ) """ applicable_methods: set[type[BaseExperiment]] = { InterruptedTimeSeries, DifferenceInDifferences, PiecewiseITS, }
[docs] def __init__( self, formulas: list[str], alpha: float = 0.05, store_experiments: bool = True, ) -> None: if not formulas: raise ValueError("formulas must be a non-empty list of formula strings.") if not all(isinstance(f, str) for f in formulas): raise TypeError("Each formula must be a string.") if not 0 < alpha < 1: raise ValueError(f"alpha must be in (0, 1), got {alpha}") self.formulas = list(formulas) self.alpha = alpha self.store_experiments = store_experiments
[docs] def validate(self, experiment: BaseExperiment) -> None: """Verify the experiment is compatible with OutcomeFalsification. Parameters ---------- experiment : BaseExperiment Candidate experiment to validate. Raises ------ TypeError If the experiment does not use a formula, or uses a non-PyMC model. """ if not hasattr(experiment, "formula"): raise TypeError( f"{type(experiment).__name__} does not use a formula. " f"OutcomeFalsification requires formula-based experiments " f"(InterruptedTimeSeries, DifferenceInDifferences, PiecewiseITS)." ) if not isinstance(experiment.model, PyMCModel): raise TypeError( f"OutcomeFalsification requires a PyMC model for posterior " f"extraction, but got {type(experiment.model).__name__}. " f"Use a PyMC model (e.g. cp.pymc_models.LinearRegression)." )
@staticmethod def _build_experiment( context: PipelineContext, formula: str, ) -> BaseExperiment: """Create and fit an experiment with a different formula. Uses ``context.experiment_config`` to reconstruct the experiment with all original parameters except the formula. """ if context.experiment_config is None: raise RuntimeError( "No experiment_config in context. Use EstimateEffect " "before SensitivityAnalysis, or pass an explicit " "experiment_factory." ) config = context.experiment_config method = config["method"] kwargs = {k: v for k, v in config.items() if k != "method"} kwargs["formula"] = formula if "model" in kwargs and kwargs["model"] is not None: kwargs["model"] = clone_model(kwargs["model"]) return method(context.data, **kwargs) @staticmethod def _extract_effect_stats( experiment: BaseExperiment, alpha: float, ) -> dict[str, float]: """Extract mean and HDI from a fitted experiment. Uses the experiment's ``effect_summary()`` method which provides standardized output across experiment types. The returned table always contains ``hdi_lower`` and ``hdi_upper`` columns. Raises ------ RuntimeError If ``effect_summary()`` returns an empty table or one that does not expose the expected ``mean`` / ``hdi_lower`` / ``hdi_upper`` columns. This is a defensive guard against future schema changes in ``effect_summary``: the single-effect contract assumed here (``iloc[0]`` is *the* effect row) should fail loudly rather than silently returning the wrong row. """ summary = experiment.effect_summary( alpha=alpha, direction="two-sided", cumulative=False, relative=False ) table = summary.table required_columns = {"mean", "hdi_lower", "hdi_upper"} if table is None or len(table) == 0: raise RuntimeError( f"{type(experiment).__name__}.effect_summary() returned an " f"empty table; cannot extract falsification effect statistics." ) missing = required_columns - set(table.columns) if missing: raise RuntimeError( f"{type(experiment).__name__}.effect_summary() table is " f"missing required column(s) {sorted(missing)}; " f"OutcomeFalsification expects a single-effect summary " f"with columns {sorted(required_columns)}." ) row = table.iloc[0] return { "mean": float(row["mean"]), "hdi_lower": float(row["hdi_lower"]), "hdi_upper": float(row["hdi_upper"]), }
[docs] def run( self, experiment: BaseExperiment, context: PipelineContext, ) -> CheckResult: """Run outcome falsification analysis. For each falsification formula, fits the experiment with the alternative formula and reports the estimated effect size with HDI intervals. This is an informational check (``passed=None``). The researcher interprets whether the pattern of effect sizes across outcomes supports their causal story. Parameters ---------- experiment : BaseExperiment The fitted experiment to check. context : PipelineContext Pipeline context providing ``experiment_config`` for re-fits. Returns ------- CheckResult With ``passed=None`` (informational). The table contains effect sizes and HDI intervals for each falsification formula. """ self.validate(experiment) if ( self.store_experiments and len(self.formulas) >= _STORE_EXPERIMENTS_WARN_THRESHOLD ): warnings.warn( f"OutcomeFalsification will store {len(self.formulas)} fitted " f"experiments (each with its own InferenceData). The combined " f"footprint can reach hundreds of MB on large datasets or " f"models with many posterior samples. Pass " f"store_experiments=False if you only need the summary " f"statistics (effect_mean, hdi_lower, hdi_upper).", stacklevel=2, ) results: list[FalsificationResult] = [] rows: list[dict[str, Any]] = [] failed_formulas: list[str] = [] hdi_pct = int(round((1 - self.alpha) * 100)) for formula in self.formulas: logger.info( "OutcomeFalsification: fitting formula '%s'", formula, ) try: alt_experiment = self._build_experiment(context, formula) stats = self._extract_effect_stats(alt_experiment, self.alpha) fr = FalsificationResult( formula=formula, effect_mean=stats["mean"], hdi_lower=stats["hdi_lower"], hdi_upper=stats["hdi_upper"], experiment=alt_experiment if self.store_experiments else None, ) results.append(fr) rows.append( { "formula": formula, "effect_mean": stats["mean"], f"hdi_{hdi_pct}%_lower": stats["hdi_lower"], f"hdi_{hdi_pct}%_upper": stats["hdi_upper"], } ) except ( PatsyError, FormulaException, DataException, ValueError, KeyError, RuntimeError, ) as exc: logger.warning( "OutcomeFalsification: failed for formula '%s'", formula, exc_info=True, ) warnings.warn( f"OutcomeFalsification: formula {formula!r} failed to fit " f"({type(exc).__name__}: {exc}); skipping.", stacklevel=2, ) failed_formulas.append(formula) rows.append( { "formula": formula, "effect_mean": float("nan"), f"hdi_{hdi_pct}%_lower": float("nan"), f"hdi_{hdi_pct}%_upper": float("nan"), } ) table = pd.DataFrame(rows) if rows else None parts = [ f"Outcome falsification: {len(self.formulas)} formula(s), {hdi_pct}% HDI.", ] if failed_formulas: parts.append( f" {len(failed_formulas)} formula(s) failed to fit: {failed_formulas}" ) for fr in results: parts.append( f" {fr.formula}: " f"effect = {fr.effect_mean:.4f} " f"[{fr.hdi_lower:.4f}, {fr.hdi_upper:.4f}]" ) text = "\n".join(parts) return CheckResult( check_name="OutcomeFalsification", passed=None, table=table, text=text, metadata={ "falsification_results": results, "alpha": self.alpha, }, )
def __repr__(self) -> str: """Return a string representation, showing only non-default flags.""" parts = [f"formulas={self.formulas!r}"] if self.alpha != 0.05: parts.append(f"alpha={self.alpha}") if not self.store_experiments: parts.append("store_experiments=False") return f"OutcomeFalsification({', '.join(parts)})"