Module `pulearn.propensity.bootstrap`

Bootstrap confidence intervals and instability warnings for propensity (c) estimators.

Functions

def bootstrap_propensity_confidence_interval(estimator, y, *, s_proba=None, X=None, n_resamples=200, confidence_level=0.95, random_state=None, std_threshold=0.05, cv_threshold=0.15, fold_spread_threshold=0.1, warn_on_instability=True)

Expand source code Browse git

def bootstrap_propensity_confidence_interval(
    estimator,
    y,
    *,
    s_proba=None,
    X=None,
    n_resamples=200,
    confidence_level=0.95,
    random_state=None,
    std_threshold=0.05,
    cv_threshold=0.15,
    fold_spread_threshold=0.1,
    warn_on_instability=True,
):
    """Estimate a propensity confidence interval with stratified bootstrap.

    For each resample a fresh clone of ``estimator`` is seeded, fitted on a
    bootstrap draw of the inputs, and its ``result_`` is validated and
    recorded. The interval endpoints are the ``alpha`` and ``1 - alpha``
    quantiles of the recorded estimates, where
    ``alpha = (1 - confidence_level) / 2``.

    Parameters
    ----------
    estimator
        Estimator to bootstrap. It is cloned for every resample and fitted
        via ``fit(labels, s_proba=..., X=...)``; the fitted object must
        expose a non-``None`` ``result_`` attribute.
    y
        Labels; normalized before resampling.
    s_proba, optional
        Scores aligned with ``y``. At least one of ``s_proba`` and ``X``
        must be provided.
    X, optional
        Feature matrix aligned with ``y``. At least one of ``s_proba`` and
        ``X`` must be provided.
    n_resamples : int, default=200
        Number of bootstrap resamples to attempt. Must be at least 2; a
        warning is issued below 30.
    confidence_level : float, default=0.95
        Confidence level, strictly between 0 and 1.
    random_state, optional
        Passed to ``check_random_state``; the resulting generator drives
        both the resampling and the per-clone seeds.
    std_threshold, cv_threshold, fold_spread_threshold : float
        Non-negative thresholds forwarded to the stability check that
        produces the warning flags.
    warn_on_instability : bool, default=True
        Whether to emit a ``UserWarning`` when any warning flag is raised.
        The flags are returned on the result either way.

    Returns
    -------
    PropensityConfidenceInterval
        Interval endpoints together with the mean and standard deviation
        (``ddof=0``) of the successful estimates, the requested and
        successful resample counts, and the warning flags.

    Raises
    ------
    ValueError
        If a parameter is out of range, if neither ``X`` nor ``s_proba`` is
        given, or if every resample fails to fit. In the last case the most
        recent fitting error is attached as ``__cause__``.
    TypeError
        If a fitted clone does not set ``result_``.

    Warns
    -----
    UserWarning
        If ``n_resamples`` is below 30, if some resamples were skipped
        because fitting raised ``ValueError``, or (when
        ``warn_on_instability`` is true) if instability flags were raised.
    """
    _validate_bootstrap_estimator(estimator)
    if n_resamples < 2:
        raise ValueError("n_resamples must be at least 2.")
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must lie strictly in (0, 1).")
    # Checked in the same order as the parameters appear in the signature.
    for threshold_name, threshold_value in (
        ("std_threshold", std_threshold),
        ("cv_threshold", cv_threshold),
        ("fold_spread_threshold", fold_spread_threshold),
    ):
        if threshold_value < 0:
            raise ValueError(
                "{} must be non-negative.".format(threshold_name)
            )
    if n_resamples < 30:
        warnings.warn(
            (
                "Bootstrap intervals with fewer than 30 resamples can be "
                "unstable."
            ),
            UserWarning,
            stacklevel=2,
        )

    context = "bootstrap {}".format(type(estimator).__name__)
    labels = _normalize_propensity_labels(y, context=context)
    if X is None and s_proba is None:
        raise ValueError("Bootstrap requires X or s_proba inputs.")
    X_arr = None
    if X is not None:
        X_arr = _validated_feature_matrix(X, labels, context=context)
    s_proba_arr = None
    if s_proba is not None:
        s_proba_arr = _propensity_score_array(s_proba, y=labels)

    rng = check_random_state(random_state)
    seed_upper_bound = np.iinfo(np.int32).max
    bootstrap_estimates = []
    failures = 0
    last_error = None
    for _ in range(n_resamples):
        # Draw the indices first, then the clone seed, so the random stream
        # is consumed in a fixed order for a given random_state.
        sample_indices = _stratified_bootstrap_indices(labels, rng)
        bootstrap_estimator = clone(estimator)
        _seed_estimator_random_state(
            bootstrap_estimator,
            int(rng.randint(seed_upper_bound)),
        )
        try:
            fitted = bootstrap_estimator.fit(
                labels[sample_indices],
                s_proba=(
                    None
                    if s_proba_arr is None
                    else s_proba_arr[sample_indices]
                ),
                X=None if X_arr is None else X_arr[sample_indices],
            )
        except ValueError as exc:
            # A failed resample is skipped rather than aborting the whole
            # bootstrap; keep the error so a total failure can report it.
            failures += 1
            last_error = exc
            continue
        result = getattr(fitted, "result_", None)
        if result is None:
            raise TypeError(
                "Bootstrap estimator {} must set result_ after fit().".format(
                    type(bootstrap_estimator).__name__
                )
            )
        bootstrap_estimates.append(
            _validate_bootstrap_result(result, bootstrap_estimator)
        )

    if not bootstrap_estimates:
        # Chain the most recent fitting error so the root cause is visible.
        raise ValueError(
            "Bootstrap failed for every resample; could not estimate a "
            "confidence interval."
        ) from last_error
    if failures:
        warnings.warn(
            (
                "Skipped {} bootstrap resamples that failed to fit cleanly."
            ).format(failures),
            UserWarning,
            stacklevel=2,
        )

    estimates = np.asarray(bootstrap_estimates, dtype=float)
    warning_flags = list(
        _stability_warning_flags(
            estimates,
            estimator=estimator,
            failures=failures,
            n_resamples=n_resamples,
            std_threshold=std_threshold,
            cv_threshold=cv_threshold,
            fold_spread_threshold=fold_spread_threshold,
        )
    )
    if warn_on_instability and warning_flags:
        warnings.warn(
            ("Propensity bootstrap for {} indicates instability: {}.").format(
                type(estimator).__name__, ", ".join(warning_flags)
            ),
            UserWarning,
            stacklevel=2,
        )

    # Two-sided percentile interval: split the excluded mass evenly.
    alpha = 0.5 * (1.0 - confidence_level)
    lower, upper = np.quantile(estimates, [alpha, 1.0 - alpha])
    return PropensityConfidenceInterval(
        lower=float(lower),
        upper=float(upper),
        confidence_level=float(confidence_level),
        n_resamples=int(n_resamples),
        successful_resamples=int(estimates.shape[0]),
        random_state=_serialize_random_state(random_state),
        mean=float(np.mean(estimates)),
        std=float(np.std(estimates, ddof=0)),
        warning_flags=tuple(warning_flags),
    )

Estimate a propensity confidence interval with stratified bootstrap.

Classes

class PropensityConfidenceInterval (lower: float, upper: float, confidence_level: float, n_resamples: int, successful_resamples: int, random_state: int | None, mean: float, std: float, warning_flags: tuple[str, ...] = <factory>)

Expand source code Browse git

@dataclass(frozen=True)
class PropensityConfidenceInterval:
    """Bootstrap confidence interval for a propensity estimate."""

    lower: float
    upper: float
    confidence_level: float
    n_resamples: int
    successful_resamples: int
    random_state: int | None
    mean: float
    std: float
    warning_flags: tuple[str, ...] = field(default_factory=tuple)

    def as_dict(self):
        """Return a machine-readable interval summary."""
        return {
            "lower": self.lower,
            "upper": self.upper,
            "confidence_level": self.confidence_level,
            "n_resamples": self.n_resamples,
            "successful_resamples": self.successful_resamples,
            "random_state": self.random_state,
            "mean": self.mean,
            "std": self.std,
            "warning_flags": list(self.warning_flags),
        }

Bootstrap confidence interval for a propensity estimate.

Instance variables

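var confidence_level : float: Confidence level of the interval, as requested by the caller.
var lower : float: Lower endpoint of the interval (lower quantile of the bootstrap estimates).
var mean : float: Mean of the successful bootstrap estimates.
var n_resamples : int: Number of bootstrap resamples requested.
var random_state : int | None: Serialized form of the random state used for the bootstrap, if any.
var std : float: Standard deviation (ddof=0) of the successful bootstrap estimates.
var successful_resamples : int: Number of resamples that fitted successfully and contributed an estimate.
var upper : float: Upper endpoint of the interval (upper quantile of the bootstrap estimates).
var warning_flags : tuple[str, ...]: Instability flags raised for the bootstrap; empty when none were raised.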

Methods

def as_dict(self)

Expand source code Browse git

def as_dict(self):
    """Return the interval as a plain dict with list-valued flags."""
    # Listed explicitly so the key order of the summary stays fixed.
    scalar_names = (
        "lower",
        "upper",
        "confidence_level",
        "n_resamples",
        "successful_resamples",
        "random_state",
        "mean",
        "std",
    )
    summary = {name: getattr(self, name) for name in scalar_names}
    # The tuple is copied into a list for the machine-readable form.
    summary["warning_flags"] = list(self.warning_flags)
    return summary

Return a machine-readable interval summary.