Source code for veldist.analysis

"""
Statistical analysis utilities for inferred velocity distributions.
"""

import numpy as np

__all__ = [
    "compute_moments",
    "cdf_percentile",
    "tail_weight",
    "bimodality_score",
    "half_68ci",
    "compute_summary",
    "compute_summary_maps",
]

# ---------------------------------------------------------------------------
# Legacy API (kept for backward compatibility)
# ---------------------------------------------------------------------------


[docs] def compute_moments(pdf_samples, grid_centers): """ Compute statistical moments from posterior LOSVD samples. .. deprecated:: This function is retained for backward compatibility. For new code, prefer :func:`compute_summary`, which returns the same quantities plus robust alternatives (median, IQR, tail weight, bimodality score) and uses the posterior half-68% CI as its uncertainty convention instead of the posterior standard deviation. Parameters ---------- pdf_samples : array-like, shape (n_samples, n_bins) MCMC samples of the probability mass function. Each row must sum to 1. grid_centers : array-like, shape (n_bins,) Centres of the velocity bins (km/s or consistent velocity unit). Returns ------- dict Keys and values: ``'mean'`` : (float, float) ``(posterior_mean, posterior_std)`` of the flux-weighted mean velocity across MCMC samples. ``'std'`` : (float, float) ``(posterior_mean, posterior_std)`` of the velocity dispersion. ``'skewness'`` : (float, float) ``(posterior_mean, posterior_std)`` of the normalised third central moment. ``'kurtosis'`` : (float, float) ``(posterior_mean, posterior_std)`` of the excess kurtosis (fourth central moment / σ⁴ − 3). """ pdf_samples = np.asarray(pdf_samples) # (n_samples, n_bins) grid_centers = np.asarray(grid_centers) # (n_bins,) # Mean velocity for each posterior sample: shape (n_samples,) means = pdf_samples @ grid_centers # Centred residuals: (n_samples, n_bins) - (n_samples, 1) delta = grid_centers[np.newaxis, :] - means[:, np.newaxis] # Variance and standard deviation via row-wise weighted dot product variance = np.einsum("ij,ij->i", pdf_samples, delta**2) stds = np.sqrt(variance) # Skewness and excess kurtosis; guard against zero-dispersion samples safe_stds = np.where(stds > 0, stds, 1.0) skews = np.einsum("ij,ij->i", pdf_samples, delta**3) / safe_stds**3 skews = np.where(stds > 0, skews, 0.0) kurts = np.einsum("ij,ij->i", pdf_samples, delta**4) / safe_stds**4 - 3 kurts = np.where(stds > 0, kurts, 0.0) return { "mean": (np.mean(means), np.std(means)), "std": (np.mean(stds), np.std(stds)), "skewness": (np.mean(skews), np.std(skews)), "kurtosis": (np.mean(kurts), np.std(kurts)), }
# --------------------------------------------------------------------------- # Public helper functions # ---------------------------------------------------------------------------
[docs] def cdf_percentile(pdf_samples, grid_centers, p): """ Interpolate velocity values at cumulative probability level(s) *p*. Builds the empirical CDF for each MCMC sample by cumulative summation of the probability mass, then uses linear interpolation to find the velocity at which the CDF equals *p*. This is the building block for :func:`compute_summary`'s median and IQR calculations, but is also useful on its own for constructing credible bands on cumulative profiles. Parameters ---------- pdf_samples : array-like, shape (n_samples, n_bins) MCMC samples of the probability mass function. Each row must sum to 1 and *grid_centers* must be monotonically increasing. grid_centers : array-like, shape (n_bins,) Bin-centre velocities in ascending order. p : float or array-like Cumulative probability level(s) in the interval [0, 1]. Scalar input returns a 1-D result; array input returns a 2-D result. Returns ------- ndarray Shape ``(n_samples,)`` when *p* is a scalar, or ``(n_samples, len(p))`` when *p* is array-like. Examples -------- Compute the posterior median velocity for each MCMC sample: >>> v_median_samples = cdf_percentile(pdf_samples, grid_centers, 0.5) Compute Q25 and Q75 simultaneously: >>> q25_q75 = cdf_percentile(pdf_samples, grid_centers, [0.25, 0.75]) >>> iqr_samples = q25_q75[:, 1] - q25_q75[:, 0] """ pdf_samples = np.asarray(pdf_samples, dtype=float) grid_centers = np.asarray(grid_centers, dtype=float) cdf = np.cumsum(pdf_samples, axis=1) # (n_samples, n_bins) scalar_p = np.ndim(p) == 0 p_arr = np.atleast_1d(np.asarray(p, dtype=float)) # np.interp is not vectorised over the xp axis, so loop over samples. result = np.array( [np.interp(p_arr, cdf[s], grid_centers) for s in range(len(pdf_samples))] ) # (n_samples, len(p_arr)) return result[:, 0] if scalar_p else result
[docs] def tail_weight(pdf_samples, grid_centers, means, stds): """ Fraction of probability mass outside ±1σ of the mean, per MCMC sample. This is a direct, model-free measure of tail heaviness that can be interpreted without any expansion assumption. It is the non-parametric analogue of the Gauss-Hermite *h₄* coefficient, but more intuitive: for a Gaussian the value is exactly ``1 − erf(1/√2) ≈ 0.3173``. Parameters ---------- pdf_samples : array-like, shape (n_samples, n_bins) MCMC samples of the probability mass function. Each row sums to 1. grid_centers : array-like, shape (n_bins,) Bin-centre velocities. means : array-like, shape (n_samples,) Per-sample flux-weighted mean velocities. Compute as ``pdf_samples @ grid_centers`` if not already available. stds : array-like, shape (n_samples,) Per-sample velocity dispersions (standard deviations of the LOSVD). Returns ------- ndarray, shape (n_samples,) Tail-weight value for each MCMC sample. Values greater than 0.317 indicate heavier tails than Gaussian (associated with radial velocity anisotropy); values below 0.317 indicate lighter tails / flat top (tangential anisotropy). Notes ----- The Gaussian reference value can be computed as:: from math import erf, sqrt gaussian_tail_weight = 1 - erf(1 / sqrt(2)) # ≈ 0.3173 Examples -------- >>> means = pdf_samples @ grid_centers >>> stds = np.sqrt(np.einsum('ij,ij->i', pdf_samples, ... (grid_centers - means[:, None])**2)) >>> tw_samples = tail_weight(pdf_samples, grid_centers, means, stds) >>> print(f"tail weight = {np.median(tw_samples):.4f}") """ pdf_samples = np.asarray(pdf_samples, dtype=float) grid_centers = np.asarray(grid_centers, dtype=float) means = np.asarray(means, dtype=float) stds = np.asarray(stds, dtype=float) delta = grid_centers[np.newaxis, :] - means[:, np.newaxis] # (n_s, n_bins) outside = np.abs(delta) > stds[:, np.newaxis] # bool (n_s, n_bins) return np.sum(pdf_samples * outside, axis=1) # (n_samples,)
[docs] def bimodality_score(pdf_samples): """ Count the number of peaks in the smoothed posterior-mean LOSVD. This is a diagnostic integer flag, not a posterior quantity. It is computed from the posterior-mean LOSVD (not from individual samples), so no credible interval is available. Bins with a score of 2 or more indicate a potentially multimodal distribution where GH-analogue summary statistics (mean, sigma, skewness, kurtosis) may be misleading and the full histogram shape should be inspected. Peak detection uses a 3-point boxcar to smooth over single-bin noise, and requires each candidate peak to exceed 1% of the global maximum to suppress spurious detections in low-probability tails. Parameters ---------- pdf_samples : array-like, shape (n_samples, n_bins) MCMC samples of the probability mass function. Each row sums to 1. Returns ------- int Number of local maxima detected. Typical values: ``1`` Unimodal distribution (normal case). ``2`` Bimodal — could indicate counter-rotation, two kinematic components, or a contaminating population. ``≥ 3`` Highly irregular; inspect visually before interpreting any scalar summaries. Notes ----- A more principled treatment would count peaks on every individual MCMC sample and return a distribution over the count. This is deferred as a future improvement; the posterior-mean approach is sufficient for identifying bins that require visual inspection. Examples -------- >>> score = bimodality_score(solver.samples["intrinsic_pdf"]) >>> if score >= 2: ... print("Multimodal — inspect full histogram before trusting moments") """ pdf_samples = np.asarray(pdf_samples, dtype=float) mean_pdf = np.mean(pdf_samples, axis=0) smoothed = np.convolve(mean_pdf, np.full(3, 1.0 / 3), mode="same") # Require each peak to exceed 1% of the global maximum to suppress # noise peaks in poorly constrained tail bins. min_height = 0.01 * smoothed.max() interior = smoothed[1:-1] left = smoothed[:-2] right = smoothed[2:] n_peaks = int( np.sum((interior > left) & (interior > right) & (interior > min_height)) ) return n_peaks
[docs] def half_68ci(samples): """ Half-width of the 68% posterior credible interval. Computes ``(p84 − p16) / 2`` — the symmetric ± uncertainty reported throughout veldist for both LOSVD values and scalar summary metrics. This matches the convention used by BayesLOSVD and is the natural non-parametric analogue of a 1-σ Gaussian error bar. Parameters ---------- samples : array-like, shape (n_samples,) Posterior samples of a scalar quantity (e.g., one metric evaluated over all MCMC draws). Returns ------- float ``(p84 − p16) / 2``, in the same units as *samples*. Notes ----- For a Gaussian posterior this equals the posterior standard deviation. For skewed or heavy-tailed posteriors it can differ substantially, but it always has the interpretation "the true value lies within ±half_68ci of the median with approximately 68% posterior probability." Examples -------- >>> v_mean_samples = pdf_samples @ grid_centers >>> uncertainty = half_68ci(v_mean_samples) >>> median = float(np.median(v_mean_samples)) >>> print(f"v_mean = {median:.1f} ± {uncertainty:.1f} km/s") """ samples = np.asarray(samples, dtype=float) p16, p84 = np.percentile(samples, [16, 84]) return float((p84 - p16) / 2.0)
# --------------------------------------------------------------------------- # Primary public API # ---------------------------------------------------------------------------
[docs] def compute_summary(pdf_samples, grid_centers): """ Compute spatially-mappable scalar summaries from posterior LOSVD samples. This is the primary analysis function for extracting kinematic maps from :class:`~veldist.KinematicSolver` output. All metrics except ``bimodality_score`` are evaluated independently on every MCMC sample, so the full posterior uncertainty — measurement noise, finite star count, and prior regularisation — is propagated automatically with no separate bootstrap or error-propagation step. Each metric is summarised as ``(posterior_median, half_68ci)`` using the same convention as the LOSVD itself: the reported uncertainty is ``(p84 − p16) / 2``. Parameters ---------- pdf_samples : array-like, shape (n_samples, n_bins) MCMC samples of the probability mass function. Each row must sum to 1. grid_centers : array-like, shape (n_bins,) Centres of the velocity bins (km/s or consistent velocity unit). Returns ------- dict Each key maps to a ``(median, half_68ci)`` tuple of floats — both in the same units as *grid_centers* for velocity quantities, and dimensionless for shape metrics — **except** ``'bimodality_score'``, which is a plain ``int``. **Location** ``'v_mean'`` Flux-weighted mean velocity. GH analogue: *V*. Sensitive to tail contamination; compare with ``v_median`` as a cross-check. ``'v_median'`` Median velocity (CDF = 0.5). Robust against edge-bin contamination and heavy tails. ``'v_asymmetry'`` Mean minus median. Near zero for symmetric LOSVDs; the sign mirrors that of the low-velocity tail (positive = mean pulled toward higher velocities by a trailing tail). Closely related to *h₃*, but does not require computing higher-order moments. **Dispersion** ``'sigma'`` Standard deviation of the LOSVD. GH analogue: *σ*. ``'iqr'`` Interquartile range Q75 − Q25. Robust dispersion estimate insensitive to tail contamination. ``'sigma_iqr'`` IQR / 1.3490 — the Gaussian-equivalent dispersion derived from the IQR. For a Gaussian, ``sigma_iqr ≈ sigma``. ``sigma_iqr < sigma`` → heavy tails (radial anisotropy); ``sigma_iqr > sigma`` → flat top (tangential anisotropy). **Shape** ``'skewness'`` Normalised third central moment *γ₁*. Zero for symmetric distributions. GH analogue: *h₃* ≈ −*γ₁* / √6. Note sign: *γ₁* > 0 (right-skewed, trailing tail) → *h₃* < 0, the expected signal on the receding side of a rotating system. ``'kurtosis'`` Excess kurtosis *κ* = fourth central moment / σ⁴ − 3. Zero for a Gaussian. GH analogue: *h₄* ≈ *κ* / √24. Positive (leptokurtic) → radially anisotropic; negative (platykurtic) → tangentially anisotropic / flat-topped. ``'tail_weight'`` Fraction of probability mass outside ±1*σ* of the mean. Gaussian reference: 0.3173. A more direct anisotropy diagnostic than *h₄*: no expansion assumption, always interpretable even for non-Gaussian shapes. See also :func:`tail_weight`. **Diagnostic** ``'bimodality_score'`` Integer number of peaks in the smoothed posterior-mean LOSVD (see :func:`bimodality_score`). Score 1 = unimodal; ≥ 2 = inspect visually. No uncertainty is returned for this metric. Notes ----- The approximate Gauss-Hermite conversions (valid for |h₃|, |h₄| ≲ 0.2): .. code-block:: text h3 ≈ -skewness / sqrt(6) h4 ≈ kurtosis / sqrt(24) These allow cross-validation against GH-based models and literature maps. For spatial bins where ``bimodality_score ≥ 2``, the mean, sigma, skewness, and kurtosis should be treated with caution: the mean lands between two peaks, sigma is inflated by their separation, and skewness reflects which peak is taller rather than any genuine asymmetry. Examples -------- >>> solver = KinematicSolver(v_grid) >>> solver.add_data(velocities, uncertainties) >>> solver.run() >>> summary = compute_summary(solver.samples["intrinsic_pdf"], ... solver.grid["centers"]) >>> v, dv = summary["v_mean"] >>> s, ds = summary["sigma"] >>> print(f"V = {v:.1f} ± {dv:.1f} σ = {s:.1f} ± {ds:.1f} km/s") """ pdf_samples = np.asarray(pdf_samples, dtype=float) # (n_samples, n_bins) grid_centers = np.asarray(grid_centers, dtype=float) # (n_bins,) # ------------------------------------------------------------------ # Moment-based quantities (fully vectorised) # ------------------------------------------------------------------ means = pdf_samples @ grid_centers # (n_samples,) delta = grid_centers[np.newaxis, :] - means[:, np.newaxis] # (n_s, n_bins) variance = np.einsum("ij,ij->i", pdf_samples, delta**2) # (n_samples,) stds = np.sqrt(variance) # (n_samples,) safe_stds = np.where(stds > 0, stds, 1.0) skews = np.einsum("ij,ij->i", pdf_samples, delta**3) / safe_stds**3 skews = np.where(stds > 0, skews, 0.0) kurts = (np.einsum("ij,ij->i", pdf_samples, delta**4) / safe_stds**4) - 3.0 kurts = np.where(stds > 0, kurts, 0.0) tw = tail_weight(pdf_samples, grid_centers, means, stds) # (n_samples,) # ------------------------------------------------------------------ # CDF-based quantities (loop over samples; fast for ~1 000 draws) # ------------------------------------------------------------------ # Single call returns (n_samples, 3) for [Q25, Q50, Q75] pctls = cdf_percentile(pdf_samples, grid_centers, np.array([0.25, 0.50, 0.75])) q25, medians, q75 = pctls[:, 0], pctls[:, 1], pctls[:, 2] iqr = q75 - q25 # (n_samples,) sigma_iqr = iqr / 1.3490 # Gaussian-equivalent σ v_asym = means - medians # (n_samples,) # ------------------------------------------------------------------ # Bimodality score (scalar — from posterior mean, not per-sample) # ------------------------------------------------------------------ bscore = bimodality_score(pdf_samples) # ------------------------------------------------------------------ # Summarise each per-sample array as (median, half_68ci) # ------------------------------------------------------------------ def _summarise(arr): p16, p50, p84 = np.percentile(arr, [16, 50, 84]) return (float(p50), float((p84 - p16) / 2.0)) return { # Location "v_mean": _summarise(means), "v_median": _summarise(medians), "v_asymmetry": _summarise(v_asym), # Dispersion "sigma": _summarise(stds), "iqr": _summarise(iqr), "sigma_iqr": _summarise(sigma_iqr), # Shape "skewness": _summarise(skews), "kurtosis": _summarise(kurts), "tail_weight": _summarise(tw), # Diagnostic (no uncertainty) "bimodality_score": bscore, }
[docs] def compute_summary_maps(solvers): """ Compute summary statistics for all solved bins from :func:`~veldist.fit_all_bins`. Iterates over a list of :class:`~veldist.KinematicSolver` instances, calls :func:`compute_summary` on each one, and assembles the results into arrays shaped ``(n_bins,)`` — one entry per spatial bin — ready to pass directly to a spatial map plotting function. Bins that were skipped during inference (``None`` entries in *solvers*, as returned for bins below ``min_stars`` by :func:`~veldist.fit_all_bins`) produce ``NaN`` in all output arrays so that the spatial indexing is preserved. Parameters ---------- solvers : list of :class:`~veldist.KinematicSolver` or None As returned by :func:`~veldist.fit_all_bins`. ``None`` entries (skipped bins) are silently mapped to ``NaN``. The list must contain at least one non-``None`` entry. Returns ------- dict One key per metric name from :func:`compute_summary`. Each value is a sub-dict with two keys: ``'median'`` : ndarray, shape (n_bins,) Posterior median of the metric. ``NaN`` for skipped bins. For ``bimodality_score``, the integer score is cast to float. ``'uncertainty'`` : ndarray, shape (n_bins,) Half-width of the 68% credible interval. ``NaN`` for skipped bins and for ``bimodality_score`` (which has no posterior CI). Raises ------ ValueError If all entries in *solvers* are ``None``. Examples -------- Build velocity and dispersion maps and plot them side by side: >>> maps = compute_summary_maps(solvers) >>> import matplotlib.pyplot as plt >>> fig, axes = plt.subplots(1, 2, figsize=(10, 4)) >>> for ax, key, label in zip(axes, ... ["v_mean", "sigma"], ... ["Mean velocity (km/s)", "Dispersion (km/s)"]): ... sc = ax.scatter(xbin, ybin, c=maps[key]["median"], cmap="RdBu_r") ... plt.colorbar(sc, ax=ax, label=label) Flag multimodal bins before plotting moments: >>> bscore = maps["bimodality_score"]["median"] >>> multimodal = bscore >= 2 >>> print(f"{multimodal.sum():.0f} bins have bimodality_score ≥ 2") Notes ----- The ``bimodality_score`` sub-dict has ``'median'`` filled with the integer score cast to float and ``'uncertainty'`` filled with ``NaN``. This keeps the return structure uniform so callers can iterate over all keys without special-casing the diagnostic metric. """ n_bins = len(solvers) # Infer the full set of metric keys from the first solved bin. first = next((s for s in solvers if s is not None), None) if first is None: raise ValueError("All solvers are None — no bins to summarise.") ref_summary = compute_summary(first.samples["intrinsic_pdf"], first.grid["centers"]) metric_keys = list(ref_summary.keys()) # Initialise all arrays to NaN. maps = { k: {"median": np.full(n_bins, np.nan), "uncertainty": np.full(n_bins, np.nan)} for k in metric_keys } for i, solver in enumerate(solvers): if solver is None: continue summary = compute_summary( solver.samples["intrinsic_pdf"], solver.grid["centers"] ) for k, v in summary.items(): if k == "bimodality_score": maps[k]["median"][i] = float(v) # uncertainty stays NaN — bimodality_score has no posterior CI else: maps[k]["median"][i] = v[0] maps[k]["uncertainty"][i] = v[1] return maps