Skip to content

cross-judge

Multi-vendor LLM ensemble-judge framework — majority / unanimous / Krippendorff-α voting across heterogeneous models.

pip install cross-judge

Quick start

# Quick start: build a three-critic panel and judge a single query.
from cross_judge import Critic, Ensemble

# Each Critic is one model configuration; `name` identifies it in the results.
critics = [
    Critic(name="claude-strict", model="anthropic/claude-sonnet-4.5", vendor="openrouter"),
    # Overrides the default temperature of 0.0 for this critic only.
    Critic(name="ds-pro-creative", model="deepseek-v4-pro", vendor="deepseek", temperature=0.7),
    Critic(name="kimi-rigor", model="moonshot/kimi-k2", vendor="openrouter"),
]
# Every critic judges the query; "majority" picks the most common label.
ensemble = Ensemble(critics=critics, voting="majority")
result = ensemble.judge(query="Is this isomorphic to power-law tail scaling?")
# Consensus label, Krippendorff's alpha, and the fraction of critics matching the consensus.
print(result.consensus, result.krippendorff_alpha, result.agreement_pct)

Core API (v0.1)

Critic dataclass

Critic(name: str, model: str, prompt_template: str = 'Judge the following query:\n{query}\n\nOutput JSON with kind (KEEP/REJECT/SPLIT/MERGE), confidence (0-1), reasoning.', vendor: str = 'deepseek', system_prompt: str = 'You are a careful judge. Output strict JSON only.', temperature: float = 0.0, max_tokens: int = 2000, api_key: str | None = None, base_url: str | None = None, http_client: Any = None, timeout: float = 60.0)

One LLM critic configuration.

Parameters:

Name Type Description Default
name str

unique critic identifier (e.g. 'claude-strict', 'deepseek-creative'). Used in EnsembleVerdict.verdicts and disagreement diagnostics.

required
model str

vendor model id (e.g. 'deepseek-v4-pro', 'gpt-4o', 'anthropic/claude-sonnet-4.5' via openrouter).

required
prompt_template str

A str.format() template with {query} and any keys from the context dict passed to .judge(). Must instruct the LLM to return strict JSON with verdict (or kind), confidence, reasoning (or rationale) fields.

'Judge the following query:\n{query}\n\nOutput JSON with kind (KEEP/REJECT/SPLIT/MERGE), confidence (0-1), reasoning.'
vendor str

'deepseek' (default) / 'openai' / 'openrouter' / 'custom'. For 'custom', pass base_url explicitly.

'deepseek'
system_prompt str

optional system message ('' = none).

'You are a careful judge. Output strict JSON only.'
temperature float

sampling temperature (default 0.0 for deterministic judging).

0.0
max_tokens int

output cap.

2000
api_key str | None

explicit API key (else read from env var per vendor).

None
base_url str | None

explicit base URL override.

None
http_client Any

inject an httpx.Client (or compatible mock) for testing.

None
timeout float

per-request timeout in seconds.

60.0
Example

critic = Critic(
    name="claude-strict",
    model="anthropic/claude-sonnet-4.5",
    vendor="openrouter",
    prompt_template="Judge: {query}\nOutput JSON: ...",
)
v = critic.judge("Is this isomorphic to power-law?", context={})

judge

judge(query: str, context: dict[str, Any] | None = None) -> Verdict

Run this critic on one query and return a Verdict.

Parameters:

Name Type Description Default
query str

the item to judge.

required
context dict[str, Any] | None

additional template variables (merged into prompt_template).

None

Returns:

Type Description
Verdict

Verdict — kind/confidence/reasoning. Errors are surfaced as Verdict(kind='ERROR', error=...) rather than raised exceptions.

Source code in packages/cross-judge/src/cross_judge/core.py
def judge(self, query: str, context: dict[str, Any] | None = None) -> Verdict:
    """Run this critic on one query and return a Verdict.

    Args:
        query: the item to judge.
        context: additional template variables (merged into prompt_template).

    Returns:
        Verdict — kind/confidence/reasoning. Errors surfaced as
        Verdict(kind='ERROR', error=...) rather than raised exceptions.
    """
    user_prompt = self._render_prompt(query, context)
    client = self._get_client()
    url = self._resolved_base_url().rstrip("/") + "/chat/completions"

    messages = []
    if self.system_prompt:
        messages.append({"role": "system", "content": self.system_prompt})
    messages.append({"role": "user", "content": user_prompt})

    payload = {
        "model": self.model,
        "messages": messages,
        "temperature": self.temperature,
        "max_tokens": self.max_tokens,
    }

    t0 = time.time()
    raw: str | None = None
    try:
        api_key = self._resolved_api_key()
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        resp = client.post(url, json=payload, headers=headers)
        if hasattr(resp, "raise_for_status"):
            resp.raise_for_status()
        data = resp.json()
        raw = (data["choices"][0]["message"]["content"] or "").strip()
    except Exception as e:
        return Verdict(
            kind="ERROR",
            confidence=0.0,
            reasoning=f"{type(e).__name__}: {e}",
            critic_id=self.name,
            raw_response=None,
            error=f"{type(e).__name__}: {e}",
            elapsed_s=round(time.time() - t0, 3),
        )

    elapsed = round(time.time() - t0, 3)
    if not raw:
        return Verdict(
            kind="ERROR",
            confidence=0.0,
            reasoning="empty response from LLM",
            critic_id=self.name,
            raw_response=None,
            error="empty_response",
            elapsed_s=elapsed,
        )

    return self._parse_verdict(raw, elapsed)

from_yaml_prompt classmethod

from_yaml_prompt(name: str, model: str, yaml_path: str, **kwargs: Any) -> 'Critic'

Build a Critic from a YAML prompt file shipped under prompts/.

YAML schema (versioned):

version: "0.1"
system_prompt: "..."
user_prompt_template: "Judge: {query}\n..."

Source code in packages/cross-judge/src/cross_judge/core.py
@classmethod
def from_yaml_prompt(
    cls,
    name: str,
    model: str,
    yaml_path: str,
    **kwargs: Any,
) -> "Critic":
    """Build a Critic from a YAML prompt file shipped under prompts/.

    YAML schema (versioned):
        version: "0.1"
        system_prompt: "..."
        user_prompt_template: "Judge: {query}\\n..."

    Args:
        name: critic identifier forwarded to the constructor.
        model: model id forwarded to the constructor.
        yaml_path: path of the YAML prompt file (read as UTF-8).
        **kwargs: any further constructor arguments, passed through unchanged.

    Returns:
        A new instance of ``cls``. A missing ``user_prompt_template`` key
        yields an empty prompt template; a missing ``system_prompt`` key
        yields "You are a careful judge.".

    Raises:
        ValueError: if the file does not contain a YAML mapping at top level
            (for example an empty file, a list, or a bare string).
    """
    import yaml

    with open(yaml_path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # safe_load gives None for an empty document and may give a list or
    # scalar; fail with a clear message instead of an AttributeError on .get().
    if not isinstance(data, dict):
        raise ValueError(
            f"Prompt file {yaml_path!r} must contain a YAML mapping at the top level, "
            f"got {type(data).__name__}."
        )
    return cls(
        name=name,
        model=model,
        prompt_template=data.get("user_prompt_template", ""),
        system_prompt=data.get("system_prompt", "You are a careful judge."),
        **kwargs,
    )

Ensemble dataclass

Ensemble(critics: list[Critic], voting: str | VotingStrategy = 'majority', voting_kwargs: dict[str, Any] = dict())

A panel of Critics + a voting strategy.

Parameters:

Name Type Description Default
critics list[Critic]

list of Critic instances. All critics judge every query.

required
voting str | VotingStrategy

'majority' (default) | 'unanimous' | custom callable returning (consensus_label, disagreement_bool).

'majority'
voting_kwargs dict[str, Any]

passed through to the voting strategy (e.g. priority=['REJECT', 'KEEP'] for tie-breaking).

dict()

judge

judge(query: str, *, query_id: str | None = None, context: dict[str, Any] | None = None, meta: dict[str, Any] | None = None) -> EnsembleVerdict

Judge a query with all critics and aggregate consensus.

Parameters:

Name Type Description Default
query str

the item to judge.

required
query_id str | None

optional explicit identifier (defaults to query truncated to 80 chars).

None
context dict[str, Any] | None

extra template variables passed to each Critic.

None
meta dict[str, Any] | None

caller-supplied metadata pass-through.

None

Returns:

Type Description
EnsembleVerdict

EnsembleVerdict — per-critic verdicts + consensus + agreement + Krippendorff α.

Source code in packages/cross-judge/src/cross_judge/ensemble.py
def judge(
    self,
    query: str,
    *,
    query_id: str | None = None,
    context: dict[str, Any] | None = None,
    meta: dict[str, Any] | None = None,
) -> EnsembleVerdict:
    """Judge a query with all critics and aggregate consensus.

    Args:
        query: the item to judge.
        query_id: optional explicit identifier (defaults to query truncated to 80 chars).
        context: extra template variables passed to each Critic.
        meta: caller-supplied metadata pass-through.

    Returns:
        EnsembleVerdict — per-critic verdicts + consensus + agreement +
        Krippendorff α.
    """
    verdicts: list[Verdict] = []
    for c in self.critics:
        verdicts.append(c.judge(query, context=context))
    return self.aggregate_verdicts(
        verdicts,
        query_id=query_id or query[:80],
        meta=meta,
    )

aggregate_verdicts

aggregate_verdicts(verdicts: list[Verdict], *, query_id: str, meta: dict[str, Any] | None = None) -> EnsembleVerdict

Aggregate a precomputed list of verdicts (useful for parallel-call orchestration outside this class).

Source code in packages/cross-judge/src/cross_judge/ensemble.py
def aggregate_verdicts(
    self,
    verdicts: list[Verdict],
    *,
    query_id: str,
    meta: dict[str, Any] | None = None,
) -> EnsembleVerdict:
    """Roll an already-collected list of verdicts up into an EnsembleVerdict.

    Useful when the critic calls were made elsewhere (e.g. in parallel) and
    only the aggregation step is needed.

    Args:
        verdicts: per-critic verdicts to aggregate.
        query_id: identifier stored on the result.
        meta: caller-supplied metadata; stored as ``{}`` when omitted or empty.

    Returns:
        EnsembleVerdict carrying the consensus label and disagreement flag
        from the configured voting strategy, the mean confidence of verdicts
        without an error, the agreement fraction and Krippendorff's α.
    """
    vote = get_voting_strategy(self.voting)
    consensus, disagreement = vote(verdicts, **self.voting_kwargs)

    # Record a readable name for the strategy: the configured string, or the
    # callable's __name__ (falling back to "custom").
    if isinstance(self.voting, str):
        voting_label = self.voting
    else:
        voting_label = getattr(self.voting, "__name__", "custom")

    # Errored verdicts are left out of the confidence average.
    usable = [v.confidence for v in verdicts if v.error is None]
    mean_confidence = sum(usable) / len(usable) if usable else 0.0

    return EnsembleVerdict(
        query_id=query_id,
        verdicts=verdicts,
        consensus=consensus,
        avg_confidence=mean_confidence,
        disagreement=disagreement,
        agreement_pct=agreement_pct(verdicts, consensus),
        krippendorff_alpha=krippendorff_alpha(verdicts),
        voting=str(voting_label),
        meta=meta or {},
    )

Verdict

Bases: BaseModel

A single critic's verdict on one query.

Attributes:

Name Type Description
kind str

VerdictKind label (KEEP / REJECT / SPLIT / MERGE / UNCLEAR / ...).

confidence float

0.0–1.0 self-reported confidence.

reasoning str

1–4 sentence rationale.

critic_id str

which critic produced this verdict.

raw_response str | None

the raw LLM response (for audit / debugging).

error str | None

error string if the call failed; kind will be 'ERROR'.

elapsed_s float

wall-clock seconds of the underlying LLM call.

The kind accepts free-form strings too (e.g. PASS / FAIL for code review), but the Literal type is the recommended vocabulary for B3/B4-style taxonomy review pipelines.

verdict property

verdict: str

Alias for kind (backward compat with legacy schema).

rationale property

rationale: str

Alias for reasoning (backward compat).

reviewer_id property

reviewer_id: str

Alias for critic_id (backward compat).

VerdictKind module-attribute

VerdictKind = Literal['KEEP', 'REJECT', 'SPLIT', 'MERGE', 'UNCLEAR', 'ERROR', 'PARSE_FAIL']

Default verdict vocabulary for the B3 / B4 universality-class review pattern.

- KEEP — accept the candidate as-is.
- REJECT — discard the candidate (does not meet universality-class standards).
- SPLIT — accept but split into multiple sub-classes (composite candidate).
- MERGE — accept but merge with an existing class (duplicate / overlap).
- UNCLEAR / ERROR / PARSE_FAIL — fallback labels for partial / failed verdicts.

EnsembleVerdict

Bases: BaseModel

Aggregate result for one query across all critics in an ensemble.

Attributes:

Name Type Description
query_id str

caller-supplied identifier for the judged item.

verdicts list[Verdict]

per-critic Verdict list (one per critic, in input order).

consensus str

the rolled-up consensus label per the ensemble's voting strategy.

avg_confidence float

mean confidence across all non-errored verdicts.

disagreement bool

True if not all critics produced the same kind.

agreement_pct float

fraction of critics that agreed with the consensus label.

krippendorff_alpha float | None

Krippendorff's α inter-rater reliability coefficient (computed treating critics as raters and labels as nominal data).

voting str

name of the voting strategy used.

meta dict[str, Any]

caller-supplied metadata pass-through.

VENDOR_DEFAULTS module-attribute

VENDOR_DEFAULTS: dict[str, tuple[str, str]] = {'deepseek': ('https://api.deepseek.com/v1', 'DEEPSEEK_API_KEY'), 'openai': ('https://api.openai.com/v1', 'OPENAI_API_KEY'), 'openrouter': ('https://openrouter.ai/api/v1', 'OPENROUTER_API_KEY')}

Voting strategies

majority_vote

majority_vote(verdicts: list[Verdict], *, priority: list[str] | None = None, fallback: str = 'UNCLEAR') -> tuple[str, bool]

Majority vote: most common label wins.

Parameters:

Name Type Description Default
verdicts list[Verdict]

per-critic verdicts.

required
priority list[str] | None

tiebreaker order — labels earlier in the list win ties.

None
fallback str

returned if no valid verdicts.

'UNCLEAR'

Returns:

Type Description
tuple[str, bool]

(consensus_label, disagreement_bool)

Source code in packages/cross-judge/src/cross_judge/voting.py
def majority_vote(
    verdicts: list[Verdict],
    *,
    priority: list[str] | None = None,
    fallback: str = "UNCLEAR",
) -> tuple[str, bool]:
    """Pick the most frequent label among the verdicts.

    Args:
        verdicts: per-critic verdicts.
        priority: tie-break order — among tied labels, the one appearing
            earliest in this list wins.
        fallback: label returned when there is nothing to count.

    Returns:
        (consensus_label, disagreement_bool). The flag is True when the
        verdicts without an error carry more than one distinct kind. Ties not
        settled by ``priority`` go to the tied label seen first in
        ``verdicts``.
    """
    counts = _label_counts(verdicts)
    if not counts:
        return fallback, False

    disagree = len({v.kind for v in verdicts if v.error is None}) > 1
    top = max(counts.values())
    leaders = [label for label, n in counts.items() if n == top]

    if len(leaders) > 1:
        # Explicit priority first ...
        for preferred in priority or ():
            if preferred in leaders:
                return preferred, disagree
        # ... then whichever tied label a non-errored verdict used first.
        for v in verdicts:
            if v.error is None and v.kind in leaders:
                return v.kind, disagree
    return leaders[0], disagree

unanimous

unanimous(verdicts: list[Verdict], *, fallback: str = 'UNCLEAR') -> tuple[str, bool]

Unanimous vote: return label only if all critics agree.

Parameters:

Name Type Description Default
verdicts list[Verdict]

per-critic verdicts.

required
fallback str

returned on any disagreement.

'UNCLEAR'

Returns:

Type Description
tuple[str, bool]

(consensus_label, disagreement_bool)

Source code in packages/cross-judge/src/cross_judge/voting.py
def unanimous(
    verdicts: list[Verdict],
    *,
    fallback: str = "UNCLEAR",
) -> tuple[str, bool]:
    """Unanimous vote: return a label only if all critics agree.

    Only verdicts whose ``error`` is None take part in the vote.

    Args:
        verdicts: per-critic verdicts.
        fallback: returned on any disagreement, and when no verdict is
            usable (empty input or every verdict errored).

    Returns:
        (consensus_label, disagreement_bool). With no usable verdicts the
        result is ``(fallback, False)``: there is nothing to disagree about,
        which matches what ``majority_vote`` reports for the same input.
    """
    labels = {v.kind for v in verdicts if v.error is None}
    if not labels:
        # Nothing to compare — do not report this as a disagreement.
        return fallback, False
    if len(labels) == 1:
        return labels.pop(), False
    return fallback, True

agreement_pct

agreement_pct(verdicts: list[Verdict], consensus: str) -> float

Fraction of valid critics whose kind == consensus.

Returns:

Type Description
float

float in [0.0, 1.0]. 0.0 if no valid verdicts.

Source code in packages/cross-judge/src/cross_judge/voting.py
def agreement_pct(verdicts: list[Verdict], consensus: str) -> float:
    """Share of valid verdicts whose kind equals the consensus label.

    Args:
        verdicts: per-critic verdicts; filtered through ``_valid`` first.
        consensus: the label to compare each verdict's kind against.

    Returns:
        float in [0.0, 1.0]. 0.0 if no valid verdicts.
    """
    usable = _valid(verdicts)
    if usable:
        return sum(v.kind == consensus for v in usable) / len(usable)
    return 0.0

krippendorff_alpha

krippendorff_alpha(verdicts: list[Verdict]) -> float | None

Krippendorff's α for nominal data.

Treats each critic as a rater and each label as nominal.

For the single-item, N-rater case: α = 1 - (D_observed / D_expected)

where

D_observed = number of disagreeing pairs of critics

D_expected = sum over (cat_i, cat_j) of n_i * n_j (i != j) / (N-1)

where n_i is the count of critics that voted cat_i, and N is the total number of critics.

The (N-1) divisor in D_expected is the standard small-sample correction (Krippendorff 2011 eq. 4).

Returns:

Type Description
float | None

α in [-1.0, 1.0]. None if fewer than 2 valid verdicts.

- 1.0 → perfect agreement
- 0.0 → agreement equal to chance
- <0.0 → systematic disagreement

Source code in packages/cross-judge/src/cross_judge/voting.py
def krippendorff_alpha(verdicts: list[Verdict]) -> float | None:
    """Krippendorff's α for nominal data, computed over one judged item.

    Each valid verdict (as selected by ``_valid``) is one rater and each
    label is a nominal category:

        α = 1 - D_observed / D_expected

        D_observed = disagreeing critic pairs / all critic pairs
        D_expected = 1 - Σ_c (n_c / N) * ((n_c - 1) / (N - 1))

    where n_c is the number of critics that chose label c and N is the
    number of valid critics.

    NOTE(review): for a single item both quantities reduce to the same ratio,
    1 - Σ_c n_c (n_c - 1) / (N (N - 1)). α is therefore 1.0 when that ratio
    is zero (every counted pair agrees) and 0.0 in every other case; negative
    values cannot occur. Evaluating the two ratios separately in floating
    point could return a value a rounding error away from 0.0, so the result
    is now derived from exact integer pair counts. An α that separates
    "chance" from "systematic" disagreement needs observations pooled over
    several items, which this single-item signature cannot express.

    Args:
        verdicts: per-critic verdicts for one item.

    Returns:
        None if fewer than 2 valid verdicts.
        1.0 → perfect agreement
        0.0 → any disagreement (observed equals expected disagreement)
    """
    valid = _valid(verdicts)
    n = len(valid)
    if n < 2:
        # α is undefined without at least one pair of raters.
        return None

    counts = _label_counts(valid)

    # Ordered pairs of critics, and the ordered pairs that share a label.
    all_pairs = n * (n - 1)
    agreeing_pairs = sum(c * (c - 1) for c in counts.values())

    if agreeing_pairs == all_pairs:
        # D_observed == D_expected == 0: everyone gave the same label.
        return 1.0
    # D_observed and D_expected are the same non-zero ratio, so 1 - D_o/D_e = 0.
    return 0.0

get_voting_strategy

get_voting_strategy(name_or_fn: str | VotingStrategy) -> VotingStrategy

Resolve a voting strategy by name or pass through if already callable.

Source code in packages/cross-judge/src/cross_judge/voting.py
def get_voting_strategy(name_or_fn: str | VotingStrategy) -> VotingStrategy:
    """Resolve a voting strategy by name or pass through if already callable."""
    if callable(name_or_fn):
        return name_or_fn  # type: ignore[return-value]
    key = str(name_or_fn).lower()
    if key not in VOTING_STRATEGIES:
        raise KeyError(
            f"Unknown voting strategy '{name_or_fn}'. Known: {sorted(VOTING_STRATEGIES)}."
        )
    return VOTING_STRATEGIES[key]

VOTING_STRATEGIES module-attribute

VOTING_STRATEGIES: dict[str, VotingStrategy] = {'majority': majority_vote, 'unanimous': unanimous}

VotingStrategy module-attribute

VotingStrategy = Callable[[list[Verdict]], tuple[str, bool]]

Vendor configuration

VENDORS module-attribute

VENDORS: dict[str, VendorConfig] = {'deepseek': VendorConfig(name='deepseek', base_url='https://api.deepseek.com/v1', api_key_env='DEEPSEEK_API_KEY'), 'openai': VendorConfig(name='openai', base_url='https://api.openai.com/v1', api_key_env='OPENAI_API_KEY'), 'openrouter': VendorConfig(name='openrouter', base_url='https://openrouter.ai/api/v1', api_key_env='OPENROUTER_API_KEY')}

VendorConfig dataclass

VendorConfig(name: str, base_url: str, api_key_env: str)

Vendor-specific connection settings for OpenAI-compatible APIs.

get_vendor

get_vendor(name: str) -> VendorConfig

Look up a vendor by name. Raises KeyError on unknown vendor.

Source code in packages/cross-judge/src/cross_judge/vendors.py
def get_vendor(name: str) -> VendorConfig:
    """Return the VendorConfig registered under ``name`` (case-insensitive).

    Raises:
        KeyError: if ``name`` is not a key of ``VENDORS``.
    """
    lookup = name.lower()
    if lookup in VENDORS:
        return VENDORS[lookup]
    raise KeyError(
        f"Unknown vendor '{name}'. Known: {sorted(VENDORS)}. "
        f"For arbitrary OpenAI-compatible endpoints, build your own client and pass it directly."
    )

make_client

make_client(vendor: str = 'deepseek', api_key: str | None = None, base_url: str | None = None) -> Any

Build an OpenAI-compatible client for a vendor.

Parameters:

Name Type Description Default
vendor str

One of 'deepseek', 'openai', 'openrouter' (default: deepseek).

'deepseek'
api_key str | None

Explicit API key. If None, read from the vendor's env var.

None
base_url str | None

Override base URL (useful for custom endpoints / mock servers).

None

Returns:

Type Description
Any

An openai.OpenAI client instance configured for the vendor.

Raises:

Type Description
RuntimeError

if the API key is missing.

ImportError

if openai package is not installed.

Source code in packages/cross-judge/src/cross_judge/vendors.py
def make_client(vendor: str = "deepseek", api_key: str | None = None, base_url: str | None = None) -> Any:
    """Create an OpenAI-compatible client pointed at the given vendor.

    Args:
        vendor: One of 'deepseek', 'openai', 'openrouter' (default: deepseek).
        api_key: Explicit API key. When omitted or empty, the vendor's
            environment variable is consulted instead.
        base_url: Replacement for the vendor's default base URL (useful for
            custom endpoints / mock servers).

    Returns:
        An `openai.OpenAI` client instance configured for the vendor.

    Raises:
        RuntimeError: if no API key is given and the env var is unset or empty.
        ImportError: if the `openai` package is not installed.
        KeyError: if the vendor is unknown (raised by `get_vendor`).
    """
    # Imported lazily so the package is only required when a client is built.
    try:
        from openai import OpenAI
    except ImportError as exc:
        raise ImportError(
            "The 'openai' package is required for cross-judge LLM calls. "
            "Install with: pip install 'openai>=1.0'"
        ) from exc

    settings = get_vendor(vendor)
    resolved_key = api_key if api_key else os.getenv(settings.api_key_env)
    if resolved_key:
        endpoint = base_url if base_url else settings.base_url
        return OpenAI(api_key=resolved_key, base_url=endpoint)

    raise RuntimeError(
        f"Missing API key for vendor '{vendor}'. "
        f"Set {settings.api_key_env} env var or pass api_key=... explicitly."
    )

Legacy API (v4 backwards compat)

Reviewer dataclass

Reviewer(reviewer_id: str, model: str, vendor: str = 'deepseek', system_prompt: str = 'You are a careful judge. Output strict JSON only.', temperature: float = 0.0, max_tokens: int = 2000, weight: float = 1.0, client: Any = None, api_key: str | None = None, base_url: str | None = None)

One LLM reviewer configuration.

ask

ask(user_prompt: str) -> Verdict

Call the LLM once and return a parsed Verdict.

Network / parse errors are caught and surfaced as Verdict(error=..., verdict='ERROR'). Callers can decide whether to retry or skip.

Source code in packages/cross-judge/src/cross_judge/reviewer.py
def ask(self, user_prompt: str) -> Verdict:
    """Call the LLM once and return a parsed Verdict.

    Failures of the completion call are caught and surfaced as
    Verdict(error=..., verdict='ERROR'); a reply from which no JSON can be
    extracted becomes verdict='PARSE_FAIL'. Callers can decide whether to
    retry or skip.

    The reported confidence is clamped to [0.0, 1.0]. A missing, null,
    non-numeric, out-of-float-range or NaN confidence is recorded as 0.0.

    Args:
        user_prompt: the user message sent after the system prompt.

    Returns:
        Verdict with the upper-cased ``verdict`` label (``UNCLEAR`` when the
        reply has none), the clamped confidence, the rationale truncated to
        1000 characters, the raw reply and the elapsed seconds.
    """
    import math  # local import: only needed for the NaN guard below

    client = self._get_client()
    t0 = time.time()
    raw: str | None = None
    try:
        resp = client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=self.temperature,
            max_tokens=self.max_tokens,
        )
        # openai-python v1+ response shape; vendor responses follow the same schema
        choice = resp.choices[0]
        raw = (getattr(choice.message, "content", None) or "").strip()
    except Exception as e:
        # Deliberately broad: one failing reviewer must not abort the caller.
        return Verdict(
            reviewer_id=self.reviewer_id,
            verdict="ERROR",
            confidence=0.0,
            rationale=f"{type(e).__name__}: {e}",
            raw_response=None,
            error=f"{type(e).__name__}: {e}",
            elapsed_s=round(time.time() - t0, 3),
        )

    elapsed = round(time.time() - t0, 3)
    if not raw:
        return Verdict(
            reviewer_id=self.reviewer_id,
            verdict="ERROR",
            confidence=0.0,
            rationale="empty response from LLM",
            raw_response=None,
            error="empty_response",
            elapsed_s=elapsed,
        )

    parsed = _extract_json(raw)
    if parsed is None:
        # Keep the start of the reply as the rationale to aid debugging.
        return Verdict(
            reviewer_id=self.reviewer_id,
            verdict="PARSE_FAIL",
            confidence=0.0,
            rationale=raw[:300],
            raw_response=raw,
            error="parse_fail",
            elapsed_s=elapsed,
        )

    verdict_label = str(parsed.get("verdict", "UNCLEAR")).strip().upper()
    conf_val = parsed.get("confidence", 0.0)
    try:
        conf = float(conf_val) if conf_val is not None else 0.0
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an integer too large to convert to float.
        conf = 0.0
    # NaN compares false against everything, so the clamp below would turn it
    # into 1.0 (full confidence); treat it as "no usable confidence" instead.
    if math.isnan(conf):
        conf = 0.0
    conf = max(0.0, min(1.0, conf))
    rationale = str(parsed.get("rationale", ""))[:1000]
    return Verdict(
        reviewer_id=self.reviewer_id,
        verdict=verdict_label,
        confidence=conf,
        rationale=rationale,
        raw_response=raw,
        error=None,
        elapsed_s=elapsed,
    )

JudgePanel dataclass

JudgePanel(reviewers: list[Reviewer], strategy: str | AggregationStrategy = 'majority', strategy_kwargs: dict[str, Any] = dict())

A panel of reviewers + an aggregation strategy.

ask

ask(item_id: str, user_prompt: str, *, meta: dict[str, Any] | None = None) -> EnsembleResult

Ask every reviewer to judge the item, then aggregate.

Source code in packages/cross-judge/src/cross_judge/panel.py
def ask(
    self,
    item_id: str,
    user_prompt: str,
    *,
    meta: dict[str, Any] | None = None,
) -> EnsembleResult:
    """Ask every reviewer to judge the item, then aggregate."""
    verdicts: list[Verdict] = []
    for r in self.reviewers:
        verdicts.append(r.ask(user_prompt))
    return self._aggregate(item_id, verdicts, meta or {})

aggregate_verdicts

aggregate_verdicts(item_id: str, verdicts: list[Verdict], *, meta: dict[str, Any] | None = None) -> EnsembleResult

Aggregate a precomputed list of verdicts (useful when calls were driven externally, e.g. via async / parallel orchestration).

Source code in packages/cross-judge/src/cross_judge/panel.py
def aggregate_verdicts(
    self,
    item_id: str,
    verdicts: list[Verdict],
    *,
    meta: dict[str, Any] | None = None,
) -> EnsembleResult:
    """Aggregate a precomputed list of verdicts (useful when calls were
    driven externally, e.g. via async / parallel orchestration)."""
    return self._aggregate(item_id, verdicts, meta or {})

EnsembleResult

Bases: BaseModel

Aggregate result for one item across all reviewers in a panel.

LegacyVerdict

Bases: BaseModel

A single reviewer's verdict for one item.

The verdict label vocabulary is caller-defined (e.g. KEEP/REJECT/UNCLEAR for taxonomy review, or PASS/FAIL/UNSURE for code review). The aggregation layer treats labels as opaque strings.

AggregationStrategy module-attribute

AggregationStrategy = Callable[[list[Verdict]], tuple[str, bool]]

majority

majority(verdicts: list[Verdict], *, priority: list[str] | None = None, fallback: str = 'UNCLEAR') -> tuple[str, bool]

Most common label wins. Ties broken by priority order (if given), else by first-seen order.

Parameters:

Name Type Description Default
verdicts list[Verdict]

per-reviewer verdicts.

required
priority list[str] | None

tiebreaker order (labels earlier in the list win ties).

None
fallback str

label to return if no valid verdicts exist.

'UNCLEAR'
Source code in packages/cross-judge/src/cross_judge/aggregation.py
def majority(
    verdicts: list[Verdict],
    *,
    priority: list[str] | None = None,
    fallback: str = "UNCLEAR",
) -> tuple[str, bool]:
    """Return the most frequent label and a disagreement flag.

    Ties are settled by `priority` order when given; otherwise the tied
    label that appears first in `verdicts` wins.

    Args:
        verdicts: per-reviewer verdicts.
        priority: tiebreaker order (labels earlier in the list win ties).
        fallback: label to return if there is nothing to count.

    Returns:
        (label, disagreement) — the flag comes from `_any_disagreement`.
    """
    counts = _label_counts(verdicts)
    if not counts:
        return fallback, False

    disagree = _any_disagreement(verdicts)
    top = max(counts.values())
    leaders = [label for label, n in counts.items() if n == top]

    if len(leaders) > 1:
        # Explicit priority first ...
        for preferred in priority or ():
            if preferred in leaders:
                return preferred, disagree
        # ... then the tied label used first by a non-errored verdict.
        for v in verdicts:
            if v.error is None and v.verdict in leaders:
                return v.verdict, disagree
    return leaders[0], disagree

weighted

weighted(verdicts: list[Verdict], *, weights: dict[str, float] | None = None, use_confidence: bool = True, fallback: str = 'UNCLEAR') -> tuple[str, bool]

Weighted vote: each verdict contributes weight = (reviewer_weight × confidence).

Parameters:

Name Type Description Default
verdicts list[Verdict]

per-reviewer verdicts.

required
weights dict[str, float] | None

optional per-reviewer weight overrides keyed by reviewer_id. Reviewers not in the dict default to weight=1.0.

None
use_confidence bool

if True, multiply weight by verdict.confidence.

True
fallback str

returned if no valid verdicts.

'UNCLEAR'
Source code in packages/cross-judge/src/cross_judge/aggregation.py
def weighted(
    verdicts: list[Verdict],
    *,
    weights: dict[str, float] | None = None,
    use_confidence: bool = True,
    fallback: str = "UNCLEAR",
) -> tuple[str, bool]:
    """Weighted vote: each verdict contributes weight = (reviewer_weight × confidence).

    Args:
        verdicts: per-reviewer verdicts.
        weights: optional per-reviewer weight overrides keyed by reviewer_id.
                 Reviewers not in the dict default to weight=1.0.
        use_confidence: if True, multiply weight by verdict.confidence.
        fallback: returned if no valid verdicts.
    """
    weights = weights or {}
    scores: dict[str, float] = {}
    for v in verdicts:
        if v.error is not None:
            continue
        w = float(weights.get(v.reviewer_id, 1.0))
        if use_confidence:
            w *= max(0.0, min(1.0, v.confidence))
        scores[v.verdict] = scores.get(v.verdict, 0.0) + w
    if not scores:
        return fallback, False
    disagree = _any_disagreement(verdicts)
    best = max(scores.items(), key=lambda kv: kv[1])
    return best[0], disagree

first_disagreement

first_disagreement(verdicts: list[Verdict], *, disagree_label: str = 'DISAGREE', fallback: str = 'UNCLEAR') -> tuple[str, bool]

Returns disagree_label if any pair of reviewers differ; else the agreed label.

Source code in packages/cross-judge/src/cross_judge/aggregation.py
def first_disagreement(
    verdicts: list[Verdict],
    *,
    disagree_label: str = "DISAGREE",
    fallback: str = "UNCLEAR",
) -> tuple[str, bool]:
    """Report `disagree_label` as soon as two non-errored verdicts differ.

    Args:
        verdicts: per-reviewer verdicts; those with an error are ignored.
        disagree_label: label returned when more than one distinct verdict exists.
        fallback: label returned when no verdict is usable.

    Returns:
        (label, disagreement): the shared label with False when all usable
        verdicts agree, `disagree_label` with True when they differ, and
        `fallback` with False when there are none.
    """
    distinct = {v.verdict for v in verdicts if v.error is None}
    if len(distinct) > 1:
        return disagree_label, True
    if distinct:
        (agreed,) = distinct
        return agreed, False
    return fallback, False

get_strategy

get_strategy(name_or_fn: str | AggregationStrategy) -> AggregationStrategy

Resolve a strategy by name (string) or pass through if already a callable.

Source code in packages/cross-judge/src/cross_judge/aggregation.py
def get_strategy(name_or_fn: str | AggregationStrategy) -> AggregationStrategy:
    """Resolve a strategy by name (string) or pass through if already a callable."""
    if callable(name_or_fn):
        return name_or_fn  # type: ignore[return-value]
    key = str(name_or_fn).lower()
    if key not in STRATEGIES:
        raise KeyError(f"Unknown strategy '{name_or_fn}'. Known: {sorted(STRATEGIES)}.")
    return STRATEGIES[key]

avg_confidence

avg_confidence(verdicts: list[Verdict]) -> float

Average confidence across valid (non-errored) verdicts.

Source code in packages/cross-judge/src/cross_judge/aggregation.py
def avg_confidence(verdicts: list[Verdict]) -> float:
    """Mean confidence of the verdicts that carry no error.

    Returns:
        The arithmetic mean, or 0.0 when every verdict errored or the list
        is empty.
    """
    scores = [v.confidence for v in verdicts if v.error is None]
    return sum(scores) / len(scores) if scores else 0.0