Source code for multiple_inference.utils

"""Conditional inference utilities.
"""
from typing import Any, Optional, Sequence, Union

import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal, wasserstein_distance
from statsmodels.base.model import LikelihoodModelResults

Numeric1DArray = Sequence[float]


def _get_sample_weight(sample_weight: Optional[np.ndarray], shape: int) -> np.ndarray:
    if sample_weight is None:
        sample_weight = np.ones(shape)
    sample_weight = np.array(sample_weight)
    return sample_weight / sample_weight.sum()



[docs]
def expected_wasserstein_distance(
    mean: Numeric1DArray,
    cov: np.ndarray,
    estimated_means: np.ndarray,
    sample_weight: np.ndarray = None,
    **kwargs: Any
) -> float:
    """Compute the expected Wasserstein distance.

    This loss function computes the Wasserstein distance between the observed means
    ``mean`` and the distribution of means you would expect to observe given the
    estimated population means ``estimated_means``.

    Args:
        mean (Numeric1DArray): (n,) array of conventional point estimates.
        cov (np.ndarray): (n, n) covariance matrix of conventional estimates.
        estimated_means (np.ndarray): (# samples, n) matrix of draws from
            a distribution of population means.
        sample_weight (np.ndarray, optional): (# samples,) array of sample weights for
            ``estimated_means``. Defaults to None.
        **kwargs (Any): Keyword arguments for ``scipy.stats.wasserstein_distance``.

    Returns:
        float: Loss.
    """
    sample_weight = _get_sample_weight(sample_weight, estimated_means.shape[0])
    compute_distance = lambda mu: wasserstein_distance(
        multivariate_normal.rvs(mu, cov), mean, **kwargs
    )
    distances = np.apply_along_axis(compute_distance, 1, estimated_means)
    return (sample_weight * distances).sum()




[docs]
def holm_bonferroni_correction(
    filename: str = None, results: LikelihoodModelResults = None, alpha: float = 0.05
) -> pd.Series:
    """Get significant coefficients by performing a Holm-Bonferroni correction.

    Args:
        filename (str, optional): Name of the csv file with conventional estimates.
            Defaults to None.
        results (LikelihoodModelResults, optional): Results. Defaults to None.
        alpha (float, optional): Significance level. Defaults to .05.

    Raises:
        ValueError: You must specify either ``filename`` or ``results`` but not both.

    Returns:
        pd.DataFrame: Dataframe indicating which coefficients are significant.
    """
    if filename is None and results is None:
        raise ValueError("filename or results must be specified.")

    if filename is not None and results is not None:
        raise ValueError("Please specify either filename or results; not both.")

    if results is None:
        from .bayes import Improper

        results = Improper.from_csv(filename).fit()
        # Improper gives pvalues for 1-tailed tests
        pvalues = np.min(
            np.array([2 * results.pvalues, 2 * (1 - results.pvalues)]), axis=0
        )
    else:
        pvalues = results.pvalues

    argsort = pvalues.argsort()
    df = pd.DataFrame(
        {"pvalues": pvalues[argsort]},
        index=np.array(results.model.exog_names)[argsort],
    )
    index = np.where(df.pvalues > alpha / (len(df) - np.arange(len(df))))[0][0]
    df["significant"] = np.arange(len(df)) < index
    return df




[docs]
def weighted_quantile(
    values: np.ndarray,
    quantiles: Union[float, Numeric1DArray],
    sample_weight: np.ndarray = None,
    values_sorted: bool = False,
) -> np.ndarray:
    """Compute weighted quantiles.

    Args:
        values (np.ndarray): (n,) array over which to compute quantiles.
        quantiles (Union[float, Numeric1DArray]): (k,) array of quantiles of interest.
        sample_weight (np.ndarray, optional): (n,) array of sample weights. Defaults to
            None.
        values_sorted (bool, optional): Indicates that ``values`` have been pre-sorted.
            Defaults to False.

    Returns:
        np.array: (k,) array of weighted quantiles.

    Acknowledgements:
        Credit to `Stackoverflow <https://stackoverflow.com/a/29677616/10676300>`_.
    """
    values = np.array(values)
    quantiles = np.atleast_1d(quantiles)  # type: ignore
    sample_weight = _get_sample_weight(sample_weight, len(values))
    assert np.all(quantiles >= 0) and np.all(  # type: ignore
        quantiles <= 1  # type: ignore
    ), "quantiles should be in [0, 1]"

    if not values_sorted:
        sorter = np.argsort(values)
        values = values[sorter]
        sample_weight = sample_weight[sorter]

    weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight  # type: ignore
    return np.interp(quantiles, weighted_quantiles, values)