Source code for hepstats.hypotests.calculators.frequentist_calculator

from __future__ import annotations

from collections.abc import Callable

import numpy as np
from scipy.stats import norm

from ...utils import base_sample, base_sampler
from ..parameters import POI, POIarray
from .basecalculator import ToysCalculator



[docs]
class FrequentistCalculator(ToysCalculator):
    """Frequentist calculator class."""

    def __init__(
        self,
        input,
        minimizer,
        ntoysnull: int = 100,
        ntoysalt: int = 100,
        sampler: Callable = base_sampler,
        sample: Callable = base_sample,
    ):
        """
        Args:
            input: loss or fit result
            minimizer: minimizer to use to find the minimum of the loss function.
            ntoysnull: minimum number of toys to generate for the null hypothesis.
            ntoysalt: minimum number of toys to generate for the alternative hypothesis.
            sampler: function used to create sampler with models, number of events and floating parameters
              in the sample. Default is :func:`hepstats.utils.fit.sampling.base_sampler`.
            sample: function used to get samples from the sampler. Default is
              :func:`hepstats.utils.fit.sampling.base_sample`.

        Example with **zfit**:
            >>> import zfit
            >>> from zfit.core.loss import UnbinnedNLL
            >>> from zfit.minimize import MinuitMinimizer
            >>>
            >>> obs = zfit.Space('x', limits=(0.1, 2.0))
            >>> data = zfit.data.Data.from_numpy(obs=obs, array=np.random.normal(1.2, 0.1, 10000))
            >>> mean = zfit.Parameter("mu", 1.2)
            >>> sigma = zfit.Parameter("sigma", 0.1)
            >>> model = zfit.pdf.Gauss(obs=obs, mu=mean, sigma=sigma)
            >>> loss = UnbinnedNLL(model=[model], data=[data], fit_range=[obs])
            >>>
            >>> calc = FrequentistCalculator(input=loss, minimizer=MinuitMinimizer(), ntoysnull=1000, ntoysalt=1000)
        """
        super().__init__(
            input=input,
            minimizer=minimizer,
            ntoysnull=ntoysnull,
            ntoysalt=ntoysalt,
            sampler=sampler,
            sample=sample,
        )


[docs]
    def qnull(
        self,
        poinull: POI | POIarray,
        poialt: POI | None = None,
        onesided: bool = True,
        onesideddiscovery: bool = False,
        qtilde: bool = False,
    ):
        """Computes null hypothesis values of the :math:`\\Delta` log-likelihood test statistic.

        Args:
            poinull: parameters of interest for the null hypothesis.
            poialt: parameters of interest for the alternative hypothesis.
            onesided: if `True` computes onesided pvalues.
            onesideddiscovery: if `True` computes onesided pvalues for a discovery test.
            qtilde: if `True` use the :math:`\\widetilde{q}` test statistics else use the :math:`q`
              test statistic.

        Returns:
            Q distribution for the null hypothesis.

        Example with **zfit**:
            >>> mean = zfit.Parameter("mu", 1.2)
            >>> poinull = POIarray(mean, [1.1, 1.2, 1.0])
            >>> poialt = POI(mean, 1.2)
            >>> q = calc.qnull(poinull, poialt)
        """
        toysresults = self.get_toys_null(poinull, poinull, qtilde)
        ret = {}

        for p in poinull:
            toysresult = toysresults[p]
            nll1 = toysresult.nlls[p]
            nll2 = toysresult.nll_bestfit
            bestfit = toysresult.bestfit

            if qtilde:
                nllat0 = toysresult.nlls[POI(poinull.parameter, 0.0)]
                nll2 = np.where(bestfit < 0, nllat0, nll2)
                bestfit = np.where(bestfit < 0, 0, bestfit)

            poi1 = POIarray(poinull.parameter, np.full(nll1.size, p.value))
            poi2 = POIarray(poinull.parameter, bestfit)

            q_p = self.q(
                nll1=nll1,
                nll2=nll2,
                poi1=poi1,
                poi2=poi2,
                onesided=onesided,
                onesideddiscovery=onesideddiscovery,
            )

            ret[p] = q_p[~np.isnan(q_p)]

        return ret



[docs]
    def qalt(
        self,
        poinull: POI | POIarray,
        poialt: POI,
        onesided: bool = True,
        onesideddiscovery: bool = False,
        qtilde: bool = False,
    ):
        """Computes alternative hypothesis values of the :math:`\\Delta` log-likelihood test statistic.

        Args:
            poinull: parameters of interest for the null hypothesis.
            poialt: parameters of interest for the alternative hypothesis.
            onesided: if `True` computes onesided pvalues.
            onesideddiscovery: if `True` computes onesided pvalues for a discovery test.
            qtilde: if `True` use the :math:`\\widetilde{q}` test statistics else use the :math:`q`
              test statistic.

        Returns:
            Q distribution for the alternative hypothesis.

        Example with **zfit**:
            >>> mean = zfit.Parameter("mu", 1.2)
            >>> poinull = POIarray(mean, [1.1, 1.2, 1.0])
            >>> poialt = POI(mean, 1.2)
            >>> q = calc.qalt(poinull, poialt)
        """
        toysresult = self.get_toys_alt(poialt, poinull, qtilde)[poialt]
        ret = {}

        for p in poinull:
            nll1 = toysresult.nlls[p]
            nll2 = toysresult.nll_bestfit
            bestfit = toysresult.bestfit

            if qtilde:
                nllat0 = toysresult.nlls[POI(poinull.parameter, 0.0)]
                nll2 = np.where(bestfit < 0, nllat0, nll2)
                bestfit = np.where(bestfit < 0, 0, bestfit)

            poi1 = POIarray(poialt.parameter, np.full(nll1.size, p.value))
            poi2 = POIarray(poialt.parameter, bestfit)

            q_p = self.q(
                nll1=nll1,
                nll2=nll2,
                poi1=poi1,
                poi2=poi2,
                onesided=onesided,
                onesideddiscovery=onesideddiscovery,
            )

            ret[p] = q_p[~np.isnan(q_p)]

        return ret


    def _pvalue_(self, poinull, poialt, qtilde, onesided, onesideddiscovery):
        qobs = self.qobs(
            poinull,
            onesided=onesided,
            qtilde=qtilde,
            onesideddiscovery=onesideddiscovery,
        )

        def compute_pvalue(qdist, qobs):
            qdist = qdist[~(np.isnan(qdist) | np.isinf(qdist))]
            return len(qdist[qdist >= qobs]) / len(qdist)

        qnulldist = self.qnull(
            poinull=poinull,
            poialt=poialt,
            onesided=onesided,
            onesideddiscovery=onesideddiscovery,
            qtilde=qtilde,
        )
        pnull = np.empty(len(poinull))
        for i, p in enumerate(poinull):
            pnull[i] = compute_pvalue(qnulldist[p], qobs[i])

        if poialt is not None:
            qaltdist = self.qalt(
                poinull=poinull,
                poialt=poialt,
                onesided=onesided,
                onesideddiscovery=onesideddiscovery,
                qtilde=qtilde,
            )
            palt = np.empty(len(poinull))
            for i, p in enumerate(poinull):
                palt[i] = compute_pvalue(qaltdist[p], qobs[i])
        else:
            palt = None

        return pnull, palt

    def _expected_pvalue_(self, poinull, poialt, nsigma, CLs, onesided, onesideddiscovery, qtilde):
        ps = {ns: {"p_clsb": np.empty(len(poinull)), "p_clb": np.empty(len(poinull))} for ns in nsigma}

        qnulldist = self.qnull(
            poinull=poinull,
            poialt=poialt,
            onesided=onesided,
            onesideddiscovery=onesideddiscovery,
            qtilde=qtilde,
        )
        qaltdist = self.qalt(
            poinull=poinull,
            poialt=poialt,
            onesided=onesided,
            onesideddiscovery=onesideddiscovery,
            qtilde=qtilde,
        )

        def filter_nan(q):
            return q[~(np.isnan(q) | np.isinf(q))]

        for i, p in enumerate(poinull):
            qaltdist_p = filter_nan(qaltdist[p])
            lqaltdist = len(qaltdist_p)

            qnulldist_p = filter_nan(qnulldist[p])
            lqnulldist = len(qnulldist_p)

            p_clsb_i = np.empty(lqaltdist)
            p_clb_i = np.empty(lqaltdist)

            for j, q in np.ndenumerate(qaltdist_p):
                p_clsb_i[j] = len(qnulldist_p[qnulldist_p >= q]) / lqnulldist
                p_clb_i[j] = len(qaltdist_p[qaltdist_p >= q]) / lqaltdist

            for ns in nsigma:
                frac = norm.cdf(ns) * 100
                ps[ns]["p_clsb"][i] = np.percentile(p_clsb_i, frac)
                ps[ns]["p_clb"][i] = np.percentile(p_clb_i, frac)

        expected_pvalues = []
        for ns in nsigma:
            if CLs:
                p_cls = ps[ns]["p_clsb"] / ps[ns]["p_clb"]
                expected_pvalues.append(np.where(p_cls < 0, 0, p_cls))
            else:
                p_clsb = ps[ns]["p_clsb"]
                expected_pvalues.append(np.where(p_clsb < 0, 0, p_clsb))

        return expected_pvalues