Three layers: a short note at the top, the key lines with our take in the middle, the full source at the bottom.

Test

test_golden.py

The golden tests. Every invoice fixture must extract to its committed expected output; any drift fails CI.

Repo path: services/extract/tests/test_golden.py · Language: Python

Short note — more on the way

What this is

The golden tests. Every invoice fixture must extract to its committed expected output; any drift fails CI.

What it proves

This file backs one or more of the privacy promises. It is a test file that lives versioned in the repository. Read the promise →

What to look for in the source below

Comments and headers that name what each section does.
File edges: imports at the top, exports or run-blocks at the bottom.
Any list, configuration, or assertion that looks load-bearing.

Show the full file (219 lines)

219 lines

"""Pytest harness for the synthetic golden-set fixtures.

Runs every GoldenCase in services/extract/tests/golden/cases.py through
the full deterministic engine (with seeded synonyms + vendor catalog)
and asserts:
  - classification.doc_type matches expected
  - classification.locale.currency matches expected
  - extraction_method matches expected
  - drift_decision.reason.value matches expected
  - needs_review_reasons has, for each expected entry, a reason equal to
    or starting with it (superset check; extra reasons are allowed)
  - every invoice_assertion predicate evaluates True

Cases that depend on PR-5's tax_breakdown/fees assembly are excluded
from this PR; they land alongside that work so the harness stays
green throughout.

PR-6 will wire this test as a CI determinism gate (run twice, byte-
identical output) per the v4 plan §Verification.
"""

from __future__ import annotations

import pytest

from adapter import translate
from engine import extract
from template_store import InMemoryTemplateStore
from tests.golden.cases import CASES, make_catalog, make_synonyms


@pytest.mark.parametrize("case", CASES, ids=lambda c: c.name)
def test_golden_case(case):
    """Run one GoldenCase through the engine and check its expectations.

    Pytest generates one invocation per entry in CASES (test id is the
    case name). Each invocation builds fresh template, synonym and
    vendor stores so state doesn't bleed between tests.

    Asserted, in order:
      - invoice["currency"]["value"] equals case.expected_currency
      - extraction_method equals case.expected_extraction_method
      - drift_decision.reason.value equals case.expected_drift_reason
      - each entry of case.expected_review_reasons_subset is equal to,
        or a prefix of, at least one of result.needs_review_reasons
      - every (description, predicate) in case.invoice_assertions holds
        for result.invoice

    The doc type is not asserted here; it is pinned by
    test_classification_doc_type_propagates_to_engine, which calls
    classify() directly.
    """
    template_store = InMemoryTemplateStore()
    synonym_store = make_synonyms()
    vendor_resolver = make_catalog()

    cells = translate(case.docling_doc)

    result = extract(
        cells=cells,
        org_id="org_golden",
        document_id=f"doc_{case.name}",
        template_store=template_store,
        synonym_store=synonym_store,
        vendor_resolver=vendor_resolver,
        docling_doc_for_classify=case.docling_doc,
    )

    # Currency
    actual_currency = result.invoice["currency"]["value"]
    assert actual_currency == case.expected_currency, (
        f"[{case.name}] currency: expected {case.expected_currency}, "
        f"got {actual_currency}"
    )

    # Extraction method
    assert result.extraction_method == case.expected_extraction_method, (
        f"[{case.name}] extraction_method: expected "
        f"{case.expected_extraction_method}, got {result.extraction_method}"
    )

    # Drift reason
    assert result.drift_decision.reason.value == case.expected_drift_reason, (
        f"[{case.name}] drift_reason: expected "
        f"{case.expected_drift_reason}, got {result.drift_decision.reason.value}"
    )

    # Review reasons SUPERSET (all expected must be present; extras allowed).
    # Matching is by equality or prefix, so an expected entry may name just
    # the leading part of a longer reason string.
    for expected_sub in case.expected_review_reasons_subset:
        matched = any(
            r == expected_sub or r.startswith(expected_sub)
            for r in result.needs_review_reasons
        )
        assert matched, (
            f"[{case.name}] expected review reason '{expected_sub}' not "
            f"found in {result.needs_review_reasons}"
        )

    # Invoice predicate assertions
    for description, predicate in case.invoice_assertions:
        assert predicate(result.invoice), (
            f"[{case.name}] invoice assertion failed: {description}\n"
            f"invoice = {result.invoice}"
        )


def test_classification_doc_type_propagates_to_engine():
    """Pin classify()'s doc_type for every golden case.

    Each case's docling_doc is passed straight to classify() and the
    returned doc_type is compared with case.expected_doc_type. Only the
    classifier output is inspected here; the engine itself is not run.
    """
    from classify import classify

    for golden in CASES:
        verdict = classify(golden.docling_doc)
        failure_note = (
            f"[{golden.name}] classify().doc_type: expected "
            f"{golden.expected_doc_type}, got {verdict.doc_type}"
        )
        assert verdict.doc_type == golden.expected_doc_type, failure_note


def test_determinism_run_twice_identical():
    """Extract every golden case twice and compare the two results.

    Two InMemoryTemplateStore instances are created up front, one per
    run, and each is reused across all cases. Within a case, both runs
    share the same cells, synonym store and vendor resolver.

    Compared per case: layout_hash, extraction_method,
    drift_decision.reason, reconciliation.residual_cents and
    ein_name_disagreement. Each assertion reports the case name on
    failure.

    template_id is not compared here because it is a random uuid per
    call; test_byte_equal_determinism_under_frozen_runtime uses
    freeze_runtime to lock the uuid + wallclock and compares the full
    serialised invoice dict instead.
    """
    first_store = InMemoryTemplateStore()
    second_store = InMemoryTemplateStore()

    for case in CASES:
        synonyms = make_synonyms()
        catalog = make_catalog()
        translated = translate(case.docling_doc)

        # Everything except the template store is common to both runs.
        common = {
            "cells": translated,
            "org_id": "org_golden",
            "document_id": f"doc_{case.name}",
            "synonym_store": synonyms,
            "vendor_resolver": catalog,
            "docling_doc_for_classify": case.docling_doc,
        }
        first = extract(template_store=first_store, **common)
        second = extract(template_store=second_store, **common)

        assert first.layout_hash == second.layout_hash, case.name
        assert first.extraction_method == second.extraction_method, case.name

        reason_first = first.drift_decision.reason
        reason_second = second.drift_decision.reason
        assert reason_first == reason_second, case.name

        residual_first = first.reconciliation.residual_cents
        residual_second = second.reconciliation.residual_cents
        assert residual_first == residual_second, case.name

        assert first.ein_name_disagreement == second.ein_name_disagreement, case.name


def test_byte_equal_determinism_under_frozen_runtime():
    """B-qa-2: frozen-runtime runs must serialise to identical JSON.

    The golden corpus is extracted twice. Each pass gets its own
    InMemoryTemplateStore (shared across the cases of that pass) and
    runs inside freeze_runtime(at=..., seed=0). Every result.invoice is
    dumped with json.dumps(sort_keys=True, default=str) and the two
    passes are compared string-for-string, case by case.
    """
    import json
    from _runtime import freeze_runtime

    def run_once() -> list[str]:
        # The store is created before the runtime is frozen.
        pass_store = InMemoryTemplateStore()

        def invoice_json(golden) -> str:
            # Build fresh per-case collaborators, extract, then dump the
            # whole invoice dict with sorted keys.
            synonyms = make_synonyms()
            catalog = make_catalog()
            translated = translate(golden.docling_doc)
            extracted = extract(
                cells=translated,
                org_id="org_golden",
                document_id=f"doc_{golden.name}",
                template_store=pass_store,
                synonym_store=synonyms,
                vendor_resolver=catalog,
                docling_doc_for_classify=golden.docling_doc,
            )
            return json.dumps(extracted.invoice, sort_keys=True, default=str)

        with freeze_runtime(at="2026-05-11T00:00:00+00:00", seed=0):
            return [invoice_json(golden) for golden in CASES]

    first_pass = run_once()
    second_pass = run_once()

    assert len(first_pass) == len(second_pass)
    for case, (sa, sb) in zip(CASES, zip(first_pass, second_pass)):
        mismatch_note = (
            f"[{case.name}] byte-equal determinism gate failed.\n"
            f"  run_a: {sa[:160]}...\n"
            f"  run_b: {sb[:160]}..."
        )
        assert sa == sb, mismatch_note


def test_case_count_is_at_least_eleven():
    """Guard against the golden set silently shrinking.

    Fails when CASES holds fewer than 11 entries. Growing the set is
    fine; shrinking it means this floor has to be lowered on purpose,
    with the reason documented.
    """
    case_count = len(CASES)
    too_few_note = (
        f"Golden set has {case_count} cases; minimum 11 covers the "
        "regression-guard bar. Adding cases is fine; removing them "
        "requires updating this assertion AND documenting why."
    )
    assert case_count >= 11, too_few_note

This is the file as it lives at the moment of this build. The canonical history lives in git. If you want the full history or a specific commit, write to hello@muntin.digital.