Source code for veldist.analysis

"""
Statistical analysis utilities for inferred velocity distributions.
"""

import numpy as np

__all__ = [
    "compute_moments",
    "cdf_percentile",
    "tail_weight",
    "bimodality_score",
    "half_68ci",
    "compute_summary",
    "compute_summary_maps",
]

# ---------------------------------------------------------------------------
# Legacy API (kept for backward compatibility)
# ---------------------------------------------------------------------------



[docs]
def compute_moments(pdf_samples, grid_centers):
    """
    Compute statistical moments from posterior LOSVD samples.

    .. deprecated::
        This function is retained for backward compatibility.  For new code,
        prefer :func:`compute_summary`, which returns the same quantities plus
        robust alternatives (median, IQR, tail weight, bimodality score) and
        uses the posterior half-68% CI as its uncertainty convention instead
        of the posterior standard deviation.

    Parameters
    ----------
    pdf_samples : array-like, shape (n_samples, n_bins)
        MCMC samples of the probability mass function.  Each row must sum to 1.
    grid_centers : array-like, shape (n_bins,)
        Centres of the velocity bins (km/s or consistent velocity unit).

    Returns
    -------
    dict
        Keys and values:

        ``'mean'`` : (float, float)
            ``(posterior_mean, posterior_std)`` of the flux-weighted mean
            velocity across MCMC samples.
        ``'std'`` : (float, float)
            ``(posterior_mean, posterior_std)`` of the velocity dispersion.
        ``'skewness'`` : (float, float)
            ``(posterior_mean, posterior_std)`` of the normalised third
            central moment.
        ``'kurtosis'`` : (float, float)
            ``(posterior_mean, posterior_std)`` of the excess kurtosis
            (fourth central moment / σ⁴ − 3).
    """
    pdf_samples = np.asarray(pdf_samples)  # (n_samples, n_bins)
    grid_centers = np.asarray(grid_centers)  # (n_bins,)

    # Mean velocity for each posterior sample: shape (n_samples,)
    means = pdf_samples @ grid_centers

    # Centred residuals: (n_samples, n_bins) - (n_samples, 1)
    delta = grid_centers[np.newaxis, :] - means[:, np.newaxis]

    # Variance and standard deviation via row-wise weighted dot product
    variance = np.einsum("ij,ij->i", pdf_samples, delta**2)
    stds = np.sqrt(variance)

    # Skewness and excess kurtosis; guard against zero-dispersion samples
    safe_stds = np.where(stds > 0, stds, 1.0)
    skews = np.einsum("ij,ij->i", pdf_samples, delta**3) / safe_stds**3
    skews = np.where(stds > 0, skews, 0.0)
    kurts = np.einsum("ij,ij->i", pdf_samples, delta**4) / safe_stds**4 - 3
    kurts = np.where(stds > 0, kurts, 0.0)

    return {
        "mean": (np.mean(means), np.std(means)),
        "std": (np.mean(stds), np.std(stds)),
        "skewness": (np.mean(skews), np.std(skews)),
        "kurtosis": (np.mean(kurts), np.std(kurts)),
    }



# ---------------------------------------------------------------------------
# Public helper functions
# ---------------------------------------------------------------------------



[docs]
def cdf_percentile(pdf_samples, grid_centers, p):
    """
    Interpolate velocity values at cumulative probability level(s) *p*.

    Builds the empirical CDF for each MCMC sample by cumulative summation of
    the probability mass, then uses linear interpolation to find the velocity
    at which the CDF equals *p*.  This is the building block for
    :func:`compute_summary`'s median and IQR calculations, but is also useful
    on its own for constructing credible bands on cumulative profiles.

    Parameters
    ----------
    pdf_samples : array-like, shape (n_samples, n_bins)
        MCMC samples of the probability mass function.  Each row must sum to 1
        and *grid_centers* must be monotonically increasing.
    grid_centers : array-like, shape (n_bins,)
        Bin-centre velocities in ascending order.
    p : float or array-like
        Cumulative probability level(s) in the interval [0, 1].  Scalar input
        returns a 1-D result; array input returns a 2-D result.

    Returns
    -------
    ndarray
        Shape ``(n_samples,)`` when *p* is a scalar, or
        ``(n_samples, len(p))`` when *p* is array-like.

    Examples
    --------
    Compute the posterior median velocity for each MCMC sample:

    >>> v_median_samples = cdf_percentile(pdf_samples, grid_centers, 0.5)

    Compute Q25 and Q75 simultaneously:

    >>> q25_q75 = cdf_percentile(pdf_samples, grid_centers, [0.25, 0.75])
    >>> iqr_samples = q25_q75[:, 1] - q25_q75[:, 0]
    """
    pdf_samples = np.asarray(pdf_samples, dtype=float)
    grid_centers = np.asarray(grid_centers, dtype=float)
    cdf = np.cumsum(pdf_samples, axis=1)  # (n_samples, n_bins)
    scalar_p = np.ndim(p) == 0
    p_arr = np.atleast_1d(np.asarray(p, dtype=float))
    # np.interp is not vectorised over the xp axis, so loop over samples.
    result = np.array(
        [np.interp(p_arr, cdf[s], grid_centers) for s in range(len(pdf_samples))]
    )  # (n_samples, len(p_arr))
    return result[:, 0] if scalar_p else result




[docs]
def tail_weight(pdf_samples, grid_centers, means, stds):
    """
    Fraction of probability mass outside ±1σ of the mean, per MCMC sample.

    This is a direct, model-free measure of tail heaviness that can be
    interpreted without any expansion assumption.  It is the non-parametric
    analogue of the Gauss-Hermite *h₄* coefficient, but more intuitive: for a
    Gaussian the value is exactly ``1 − erf(1/√2) ≈ 0.3173``.

    Parameters
    ----------
    pdf_samples : array-like, shape (n_samples, n_bins)
        MCMC samples of the probability mass function.  Each row sums to 1.
    grid_centers : array-like, shape (n_bins,)
        Bin-centre velocities.
    means : array-like, shape (n_samples,)
        Per-sample flux-weighted mean velocities.  Compute as
        ``pdf_samples @ grid_centers`` if not already available.
    stds : array-like, shape (n_samples,)
        Per-sample velocity dispersions (standard deviations of the LOSVD).

    Returns
    -------
    ndarray, shape (n_samples,)
        Tail-weight value for each MCMC sample.  Values greater than 0.317
        indicate heavier tails than Gaussian (associated with radial velocity
        anisotropy); values below 0.317 indicate lighter tails / flat top
        (tangential anisotropy).

    Notes
    -----
    The Gaussian reference value can be computed as::

        from math import erf, sqrt
        gaussian_tail_weight = 1 - erf(1 / sqrt(2))   # ≈ 0.3173

    Examples
    --------
    >>> means = pdf_samples @ grid_centers
    >>> stds  = np.sqrt(np.einsum('ij,ij->i', pdf_samples,
    ...                           (grid_centers - means[:, None])**2))
    >>> tw_samples = tail_weight(pdf_samples, grid_centers, means, stds)
    >>> print(f"tail weight = {np.median(tw_samples):.4f}")
    """
    pdf_samples = np.asarray(pdf_samples, dtype=float)
    grid_centers = np.asarray(grid_centers, dtype=float)
    means = np.asarray(means, dtype=float)
    stds = np.asarray(stds, dtype=float)
    delta = grid_centers[np.newaxis, :] - means[:, np.newaxis]  # (n_s, n_bins)
    outside = np.abs(delta) > stds[:, np.newaxis]  # bool (n_s, n_bins)
    return np.sum(pdf_samples * outside, axis=1)  # (n_samples,)




[docs]
def bimodality_score(pdf_samples):
    """
    Count the number of peaks in the smoothed posterior-mean LOSVD.

    This is a diagnostic integer flag, not a posterior quantity.  It is
    computed from the posterior-mean LOSVD (not from individual samples), so
    no credible interval is available.  Bins with a score of 2 or more
    indicate a potentially multimodal distribution where GH-analogue summary
    statistics (mean, sigma, skewness, kurtosis) may be misleading and the
    full histogram shape should be inspected.

    Peak detection uses a 3-point boxcar to smooth over single-bin noise, and
    requires each candidate peak to exceed 1% of the global maximum to
    suppress spurious detections in low-probability tails.

    Parameters
    ----------
    pdf_samples : array-like, shape (n_samples, n_bins)
        MCMC samples of the probability mass function.  Each row sums to 1.

    Returns
    -------
    int
        Number of local maxima detected.  Typical values:

        ``1``
            Unimodal distribution (normal case).
        ``2``
            Bimodal — could indicate counter-rotation, two kinematic
            components, or a contaminating population.
        ``≥ 3``
            Highly irregular; inspect visually before interpreting any
            scalar summaries.

    Notes
    -----
    A more principled treatment would count peaks on every individual MCMC
    sample and return a distribution over the count.  This is deferred as a
    future improvement; the posterior-mean approach is sufficient for
    identifying bins that require visual inspection.

    Examples
    --------
    >>> score = bimodality_score(solver.samples["intrinsic_pdf"])
    >>> if score >= 2:
    ...     print("Multimodal — inspect full histogram before trusting moments")
    """
    pdf_samples = np.asarray(pdf_samples, dtype=float)
    mean_pdf = np.mean(pdf_samples, axis=0)
    smoothed = np.convolve(mean_pdf, np.full(3, 1.0 / 3), mode="same")
    # Require each peak to exceed 1% of the global maximum to suppress
    # noise peaks in poorly constrained tail bins.
    min_height = 0.01 * smoothed.max()
    interior = smoothed[1:-1]
    left = smoothed[:-2]
    right = smoothed[2:]
    n_peaks = int(
        np.sum((interior > left) & (interior > right) & (interior > min_height))
    )
    return n_peaks




[docs]
def half_68ci(samples):
    """
    Half-width of the 68% posterior credible interval.

    Computes ``(p84 − p16) / 2`` — the symmetric ± uncertainty reported
    throughout veldist for both LOSVD values and scalar summary metrics.
    This matches the convention used by BayesLOSVD and is the natural
    non-parametric analogue of a 1-σ Gaussian error bar.

    Parameters
    ----------
    samples : array-like, shape (n_samples,)
        Posterior samples of a scalar quantity (e.g., one metric evaluated
        over all MCMC draws).

    Returns
    -------
    float
        ``(p84 − p16) / 2``, in the same units as *samples*.

    Notes
    -----
    For a Gaussian posterior this equals the posterior standard deviation.
    For skewed or heavy-tailed posteriors it can differ substantially, but
    it always has the interpretation "the true value lies within ±half_68ci
    of the median with approximately 68% posterior probability."

    Examples
    --------
    >>> v_mean_samples = pdf_samples @ grid_centers
    >>> uncertainty = half_68ci(v_mean_samples)
    >>> median = float(np.median(v_mean_samples))
    >>> print(f"v_mean = {median:.1f} ± {uncertainty:.1f} km/s")
    """
    samples = np.asarray(samples, dtype=float)
    p16, p84 = np.percentile(samples, [16, 84])
    return float((p84 - p16) / 2.0)



# ---------------------------------------------------------------------------
# Primary public API
# ---------------------------------------------------------------------------



[docs]
def compute_summary(pdf_samples, grid_centers):
    """
    Compute spatially-mappable scalar summaries from posterior LOSVD samples.

    This is the primary analysis function for extracting kinematic maps from
    :class:`~veldist.KinematicSolver` output.  All metrics except
    ``bimodality_score`` are evaluated independently on every MCMC sample,
    so the full posterior uncertainty — measurement noise, finite star count,
    and prior regularisation — is propagated automatically with no separate
    bootstrap or error-propagation step.

    Each metric is summarised as ``(posterior_median, half_68ci)`` using the
    same convention as the LOSVD itself: the reported uncertainty is
    ``(p84 − p16) / 2``.

    Parameters
    ----------
    pdf_samples : array-like, shape (n_samples, n_bins)
        MCMC samples of the probability mass function.  Each row must sum to 1.
    grid_centers : array-like, shape (n_bins,)
        Centres of the velocity bins (km/s or consistent velocity unit).

    Returns
    -------
    dict
        Each key maps to a ``(median, half_68ci)`` tuple of floats — both in
        the same units as *grid_centers* for velocity quantities, and
        dimensionless for shape metrics — **except** ``'bimodality_score'``,
        which is a plain ``int``.

        **Location**

        ``'v_mean'``
            Flux-weighted mean velocity.  GH analogue: *V*.  Sensitive to
            tail contamination; compare with ``v_median`` as a cross-check.
        ``'v_median'``
            Median velocity (CDF = 0.5).  Robust against edge-bin
            contamination and heavy tails.
        ``'v_asymmetry'``
            Mean minus median.  Near zero for symmetric LOSVDs; the sign
            mirrors that of the low-velocity tail (positive = mean pulled
            toward higher velocities by a trailing tail).  Closely related
            to *h₃*, but does not require computing higher-order moments.

        **Dispersion**

        ``'sigma'``
            Standard deviation of the LOSVD.  GH analogue: *σ*.
        ``'iqr'``
            Interquartile range Q75 − Q25.  Robust dispersion estimate
            insensitive to tail contamination.
        ``'sigma_iqr'``
            IQR / 1.3490 — the Gaussian-equivalent dispersion derived from
            the IQR.  For a Gaussian, ``sigma_iqr ≈ sigma``.
            ``sigma_iqr < sigma`` → heavy tails (radial anisotropy);
            ``sigma_iqr > sigma`` → flat top (tangential anisotropy).

        **Shape**

        ``'skewness'``
            Normalised third central moment *γ₁*.  Zero for symmetric
            distributions.  GH analogue: *h₃* ≈ −*γ₁* / √6.  Note sign:
            *γ₁* > 0 (right-skewed, trailing tail) → *h₃* < 0, the
            expected signal on the receding side of a rotating system.
        ``'kurtosis'``
            Excess kurtosis *κ* = fourth central moment / σ⁴ − 3.  Zero for
            a Gaussian.  GH analogue: *h₄* ≈ *κ* / √24.  Positive
            (leptokurtic) → radially anisotropic; negative (platykurtic) →
            tangentially anisotropic / flat-topped.
        ``'tail_weight'``
            Fraction of probability mass outside ±1*σ* of the mean.
            Gaussian reference: 0.3173.  A more direct anisotropy diagnostic
            than *h₄*: no expansion assumption, always interpretable even for
            non-Gaussian shapes.  See also :func:`tail_weight`.

        **Diagnostic**

        ``'bimodality_score'``
            Integer number of peaks in the smoothed posterior-mean LOSVD
            (see :func:`bimodality_score`).  Score 1 = unimodal; ≥ 2 =
            inspect visually.  No uncertainty is returned for this metric.

    Notes
    -----
    The approximate Gauss-Hermite conversions (valid for |h₃|, |h₄| ≲ 0.2):

    .. code-block:: text

        h3 ≈ -skewness / sqrt(6)
        h4 ≈  kurtosis / sqrt(24)

    These allow cross-validation against GH-based models and literature maps.

    For spatial bins where ``bimodality_score ≥ 2``, the mean, sigma,
    skewness, and kurtosis should be treated with caution: the mean lands
    between two peaks, sigma is inflated by their separation, and skewness
    reflects which peak is taller rather than any genuine asymmetry.

    Examples
    --------
    >>> solver = KinematicSolver(v_grid)
    >>> solver.add_data(velocities, uncertainties)
    >>> solver.run()
    >>> summary = compute_summary(solver.samples["intrinsic_pdf"],
    ...                           solver.grid["centers"])
    >>> v, dv = summary["v_mean"]
    >>> s, ds = summary["sigma"]
    >>> print(f"V = {v:.1f} ± {dv:.1f}  σ = {s:.1f} ± {ds:.1f}  km/s")
    """
    pdf_samples = np.asarray(pdf_samples, dtype=float)  # (n_samples, n_bins)
    grid_centers = np.asarray(grid_centers, dtype=float)  # (n_bins,)

    # ------------------------------------------------------------------
    # Moment-based quantities (fully vectorised)
    # ------------------------------------------------------------------
    means = pdf_samples @ grid_centers  # (n_samples,)
    delta = grid_centers[np.newaxis, :] - means[:, np.newaxis]  # (n_s, n_bins)

    variance = np.einsum("ij,ij->i", pdf_samples, delta**2)  # (n_samples,)
    stds = np.sqrt(variance)  # (n_samples,)
    safe_stds = np.where(stds > 0, stds, 1.0)

    skews = np.einsum("ij,ij->i", pdf_samples, delta**3) / safe_stds**3
    skews = np.where(stds > 0, skews, 0.0)

    kurts = (np.einsum("ij,ij->i", pdf_samples, delta**4) / safe_stds**4) - 3.0
    kurts = np.where(stds > 0, kurts, 0.0)

    tw = tail_weight(pdf_samples, grid_centers, means, stds)  # (n_samples,)

    # ------------------------------------------------------------------
    # CDF-based quantities  (loop over samples; fast for ~1 000 draws)
    # ------------------------------------------------------------------
    # Single call returns (n_samples, 3) for [Q25, Q50, Q75]
    pctls = cdf_percentile(pdf_samples, grid_centers, np.array([0.25, 0.50, 0.75]))
    q25, medians, q75 = pctls[:, 0], pctls[:, 1], pctls[:, 2]

    iqr = q75 - q25  # (n_samples,)
    sigma_iqr = iqr / 1.3490  # Gaussian-equivalent σ
    v_asym = means - medians  # (n_samples,)

    # ------------------------------------------------------------------
    # Bimodality score (scalar — from posterior mean, not per-sample)
    # ------------------------------------------------------------------
    bscore = bimodality_score(pdf_samples)

    # ------------------------------------------------------------------
    # Summarise each per-sample array as (median, half_68ci)
    # ------------------------------------------------------------------
    def _summarise(arr):
        p16, p50, p84 = np.percentile(arr, [16, 50, 84])
        return (float(p50), float((p84 - p16) / 2.0))

    return {
        # Location
        "v_mean": _summarise(means),
        "v_median": _summarise(medians),
        "v_asymmetry": _summarise(v_asym),
        # Dispersion
        "sigma": _summarise(stds),
        "iqr": _summarise(iqr),
        "sigma_iqr": _summarise(sigma_iqr),
        # Shape
        "skewness": _summarise(skews),
        "kurtosis": _summarise(kurts),
        "tail_weight": _summarise(tw),
        # Diagnostic (no uncertainty)
        "bimodality_score": bscore,
    }




[docs]
def compute_summary_maps(solvers):
    """
    Compute summary statistics for all solved bins from :func:`~veldist.fit_all_bins`.

    Iterates over a list of :class:`~veldist.KinematicSolver` instances,
    calls :func:`compute_summary` on each one, and assembles the results into
    arrays shaped ``(n_bins,)`` — one entry per spatial bin — ready to pass
    directly to a spatial map plotting function.

    Bins that were skipped during inference (``None`` entries in *solvers*, as
    returned for bins below ``min_stars`` by :func:`~veldist.fit_all_bins`)
    produce ``NaN`` in all output arrays so that the spatial indexing is
    preserved.

    Parameters
    ----------
    solvers : list of :class:`~veldist.KinematicSolver` or None
        As returned by :func:`~veldist.fit_all_bins`.  ``None`` entries
        (skipped bins) are silently mapped to ``NaN``.  The list must
        contain at least one non-``None`` entry.

    Returns
    -------
    dict
        One key per metric name from :func:`compute_summary`.  Each value is
        a sub-dict with two keys:

        ``'median'`` : ndarray, shape (n_bins,)
            Posterior median of the metric.  ``NaN`` for skipped bins.  For
            ``bimodality_score``, the integer score is cast to float.
        ``'uncertainty'`` : ndarray, shape (n_bins,)
            Half-width of the 68% credible interval.  ``NaN`` for skipped
            bins and for ``bimodality_score`` (which has no posterior CI).

    Raises
    ------
    ValueError
        If all entries in *solvers* are ``None``.

    Examples
    --------
    Build velocity and dispersion maps and plot them side by side:

    >>> maps = compute_summary_maps(solvers)
    >>> import matplotlib.pyplot as plt
    >>> fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    >>> for ax, key, label in zip(axes,
    ...                           ["v_mean", "sigma"],
    ...                           ["Mean velocity (km/s)", "Dispersion (km/s)"]):
    ...     sc = ax.scatter(xbin, ybin, c=maps[key]["median"], cmap="RdBu_r")
    ...     plt.colorbar(sc, ax=ax, label=label)

    Flag multimodal bins before plotting moments:

    >>> bscore = maps["bimodality_score"]["median"]
    >>> multimodal = bscore >= 2
    >>> print(f"{multimodal.sum():.0f} bins have bimodality_score ≥ 2")

    Notes
    -----
    The ``bimodality_score`` sub-dict has ``'median'`` filled with the
    integer score cast to float and ``'uncertainty'`` filled with ``NaN``.
    This keeps the return structure uniform so callers can iterate over all
    keys without special-casing the diagnostic metric.
    """
    n_bins = len(solvers)

    # Infer the full set of metric keys from the first solved bin.
    first = next((s for s in solvers if s is not None), None)
    if first is None:
        raise ValueError("All solvers are None — no bins to summarise.")

    ref_summary = compute_summary(first.samples["intrinsic_pdf"], first.grid["centers"])
    metric_keys = list(ref_summary.keys())

    # Initialise all arrays to NaN.
    maps = {
        k: {"median": np.full(n_bins, np.nan), "uncertainty": np.full(n_bins, np.nan)}
        for k in metric_keys
    }

    for i, solver in enumerate(solvers):
        if solver is None:
            continue

        summary = compute_summary(
            solver.samples["intrinsic_pdf"], solver.grid["centers"]
        )

        for k, v in summary.items():
            if k == "bimodality_score":
                maps[k]["median"][i] = float(v)
                # uncertainty stays NaN — bimodality_score has no posterior CI
            else:
                maps[k]["median"][i] = v[0]
                maps[k]["uncertainty"][i] = v[1]

    return maps