Source code for artefactual.scoring.entropy_methods.entropy_contributions
from collections.abc import Mapping, Sequence
from typing import Any

import numpy as np
from beartype import beartype
from numpy.typing import NDArray

EPSILON = 1e-12
@beartype
def compute_entropy_contributions(logprobs: NDArray[np.floating] | Sequence[Any], k: int) -> NDArray[np.floating]:
    """Compute entropic contributions s_kj = -p_k log(p_k) for top-k logprobs using vectorized operations.

    Args:
        logprobs: A 2D array of shape (num_tokens, num_logprobs) containing log probabilities.
        k: Number of top log probabilities to consider per token.

    Returns:
        A 2D array of shape (num_tokens, k) containing entropy contributions.
    """
    if not isinstance(logprobs, np.ndarray):
        if not logprobs:
            logprobs = np.empty((0, 0), dtype=np.float32)
        else:
            # Handle potential ragged sequences by padding with -inf
            max_len = max(len(row) for row in logprobs)
            padded_logprobs = np.full((len(logprobs), max_len), -np.inf, dtype=np.float32)
            for i, row in enumerate(logprobs):
                if row:
                    vals = list(row.values()) if isinstance(row, Mapping) else row
                    # Handle objects with a logprob attribute (e.g. vLLM Logprob objects)
                    vals = [v.logprob if hasattr(v, "logprob") else v for v in vals]
                    padded_logprobs[i, : len(vals)] = vals
            logprobs = padded_logprobs

    if logprobs.size == 0:
        return np.empty((0, k), dtype=np.float32)

    # Convert to probabilities (logprobs are in natural log, base e)
    probs = np.exp(logprobs)

    # Calculate entropy contributions: s = -p * log(p) = -exp(logp) * logp
    # (-inf padding yields exp(-inf) * -inf = 0 * -inf = nan, zeroed just below)
    with np.errstate(divide="ignore", invalid="ignore"):
        s = -probs * logprobs
    s = np.nan_to_num(s, nan=0.0, posinf=0.0, neginf=0.0)

    # Pad or truncate to k elements along the top-k dimension (axis=1)
    num_tokens, num_logprobs = s.shape
    if num_logprobs == k:
        return s
    s_kj = np.zeros((num_tokens, k), dtype=np.float32)
    if num_logprobs < k:
        s_kj[:, :num_logprobs] = s
    else:  # num_logprobs > k
        s_kj[:, :] = s[:, :k]
    return s_kj
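
# --- Usage sketch (not part of the module source): a minimal, illustrative call,
# assuming natural-log inputs as documented above. All values are made up for the example.
if __name__ == "__main__":
    # A peaked distribution contributes less entropy than a flat one (ln 3 ~ 1.0986).
    logprobs = np.log(np.array([[0.7, 0.2, 0.1], [1 / 3, 1 / 3, 1 / 3]], dtype=np.float32))
    s = compute_entropy_contributions(logprobs, k=5)
    print(s.shape)        # (2, 5): columns 3 and 4 are zero-padded up to k=5
    print(s.sum(axis=1))  # per-token entropy over the top-k mass, ~[0.802, 1.099]

    # Ragged mappings (e.g. vLLM-style token_id -> logprob dicts) are padded internally.
    ragged = [{11: -0.1, 42: -2.5}, {7: -0.05}]
    print(compute_entropy_contributions(ragged, k=3).shape)  # (2, 3)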