Three layers: a short note at the top, the key lines with our take in the middle, the full source at the bottom.
Test
test_golden.py
The golden tests. Every invoice fixture must extract to its committed expected output; any drift fails CI.
Repo path: services/extract/tests/test_golden.py · Language: Python
Short note — more on the way
What this is
The golden tests. Every invoice fixture must extract to its committed expected output; any drift fails CI.
What it proves
This file backs one or more of the privacy promises. It is a test file that lives versioned in the repository. Read the promise →
What to look for in the source below
- Comments and headers that name what each section does.
- File edges: imports at the top, exports or run-blocks at the bottom.
- Any list, configuration, or assertion that looks load-bearing.
Show the full file (220 lines)
219 lines
"""Pytest harness for the synthetic golden-set fixtures.
Runs every GoldenCase in services/extract/tests/golden/cases.py through
the full deterministic engine (with seeded synonyms + vendor catalog)
and asserts:
- classification.doc_type matches expected
- classification.locale.currency matches expected
- extraction_method matches expected
- drift_decision.reason.value matches expected
- needs_review_reasons contains, for each expected entry, a reason equal to it or starting with it (superset check)
- every invoice_assertion predicate evaluates True
Cases that depend on PR-5's tax_breakdown/fees assembly are excluded
from this PR; they land alongside that work so the harness stays
green throughout.
PR-6 will wire this test as a CI determinism gate (run twice, byte-
identical output) per the v4 plan §Verification.
"""
from __future__ import annotations
import pytest
from adapter import translate
from engine import extract
from template_store import InMemoryTemplateStore
from tests.golden.cases import CASES, make_catalog, make_synonyms
@pytest.mark.parametrize("case", CASES, ids=lambda c: c.name)
def test_golden_case(case):
    """Run one GoldenCase through the engine and check its expected outputs.

    Parametrized over ``CASES`` (ids are the case names), so each case is
    its own pytest invocation. The template store, synonym store and vendor
    resolver are rebuilt for every case so state doesn't bleed between tests.

    Checked, in order: currency, extraction method, drift reason, presence
    of every expected review reason (extras allowed), and every invoice
    predicate.

    The expected doc type is deliberately not asserted here. The previous
    version read ``result.invoice.get("_doc_type")`` into a local that was
    never used; per its own comment that key is not in the invoice schema.
    ``test_classification_doc_type_propagates_to_engine`` asserts
    ``classify().doc_type`` for every case instead.
    """
    template_store = InMemoryTemplateStore()
    synonym_store = make_synonyms()
    vendor_resolver = make_catalog()
    cells = translate(case.docling_doc)
    result = extract(
        cells=cells,
        org_id="org_golden",
        document_id=f"doc_{case.name}",
        template_store=template_store,
        synonym_store=synonym_store,
        vendor_resolver=vendor_resolver,
        docling_doc_for_classify=case.docling_doc,
    )

    # Currency
    actual_currency = result.invoice["currency"]["value"]
    assert actual_currency == case.expected_currency, (
        f"[{case.name}] currency: expected {case.expected_currency}, "
        f"got {actual_currency}"
    )

    # Extraction method
    assert result.extraction_method == case.expected_extraction_method, (
        f"[{case.name}] extraction_method: expected "
        f"{case.expected_extraction_method}, got {result.extraction_method}"
    )

    # Drift reason
    assert result.drift_decision.reason.value == case.expected_drift_reason, (
        f"[{case.name}] drift_reason: expected "
        f"{case.expected_drift_reason}, got {result.drift_decision.reason.value}"
    )

    # Review reasons SUPERSET (all expected must be present; extras allowed).
    # An expected entry matches a reason that equals it or starts with it.
    for expected_sub in case.expected_review_reasons_subset:
        matched = any(
            r == expected_sub or r.startswith(expected_sub)
            for r in result.needs_review_reasons
        )
        assert matched, (
            f"[{case.name}] expected review reason '{expected_sub}' not "
            f"found in {result.needs_review_reasons}"
        )

    # Invoice predicate assertions: each entry is (description, predicate),
    # and the predicate is called with the full invoice dict.
    for description, predicate in case.invoice_assertions:
        assert predicate(result.invoice), (
            f"[{case.name}] invoice assertion failed: {description}\n"
            f"invoice = {result.invoice}"
        )
def test_classification_doc_type_propagates_to_engine():
    """Pin the doc type that classify() reports for every golden case.

    Calls classify() directly on each case's docling document and compares
    the returned ``doc_type`` with the case's ``expected_doc_type``. The
    loop stops at the first mismatch, and the failure message names the
    offending case.
    """
    from classify import classify

    for golden in CASES:
        got = classify(golden.docling_doc).doc_type
        want = golden.expected_doc_type
        assert got == want, (
            f"[{golden.name}] classify().doc_type: expected "
            f"{want}, got {got}"
        )
def test_determinism_run_twice_identical():
    """Run the entire golden set twice; assertions about extraction_method,
    drift_reason, review_reasons, and currency must match across runs.

    Each case is extracted twice from the same cells, once against each of
    two independent template stores that persist across the whole loop.
    The two results are then compared field by field: layout hash,
    extraction method, drift reason, review reasons, currency value,
    reconciliation residual, and the EIN/name disagreement flag.

    This is the PR-6 determinism CI gate's contract. Bytes-identical
    output across runs requires PYTHONHASHSEED control which the CI
    job will set; this in-process check covers the engine's logic
    being deterministic regardless of dict iteration order.
    """
    template_store_a = InMemoryTemplateStore()
    template_store_b = InMemoryTemplateStore()
    for case in CASES:
        synonym_store = make_synonyms()
        vendor_resolver = make_catalog()
        cells = translate(case.docling_doc)
        result_a = extract(
            cells=cells,
            org_id="org_golden",
            document_id=f"doc_{case.name}",
            template_store=template_store_a,
            synonym_store=synonym_store,
            vendor_resolver=vendor_resolver,
            docling_doc_for_classify=case.docling_doc,
        )
        result_b = extract(
            cells=cells,
            org_id="org_golden",
            document_id=f"doc_{case.name}",
            template_store=template_store_b,
            synonym_store=synonym_store,
            vendor_resolver=vendor_resolver,
            docling_doc_for_classify=case.docling_doc,
        )
        assert result_a.layout_hash == result_b.layout_hash, case.name
        assert result_a.extraction_method == result_b.extraction_method, case.name
        assert (
            result_a.drift_decision.reason
            == result_b.drift_decision.reason
        ), case.name
        # Review reasons and currency are named in the docstring contract
        # but were previously never compared; check them explicitly.
        assert (
            result_a.needs_review_reasons
            == result_b.needs_review_reasons
        ), case.name
        assert (
            result_a.invoice["currency"]["value"]
            == result_b.invoice["currency"]["value"]
        ), case.name
        # template_id differs (random uuid per call); skip here --
        # test_byte_equal_determinism_under_frozen_runtime uses
        # freeze_runtime to lock the uuid + wallclock so template_id
        # matches across runs and the full invoice dict serialises
        # identically.
        assert (
            result_a.reconciliation.residual_cents
            == result_b.reconciliation.residual_cents
        ), case.name
        assert result_a.ein_name_disagreement == result_b.ein_name_disagreement, case.name
def test_byte_equal_determinism_under_frozen_runtime():
    """B-qa-2: two frozen-runtime passes over the golden corpus must
    serialise to byte-for-byte identical JSON.

    Each pass builds its own template store, enters ``freeze_runtime``
    with a fixed timestamp and seed (per the original note, this pins
    the clock and the uuid generator), extracts every case in ``CASES``
    order, and dumps each invoice dict with sorted keys. The two lists
    of JSON strings are then compared case by case. This is the actual
    PR-6 determinism contract; the weaker field-by-field check skips
    template_id because uuid4() per call is non-deterministic.
    """
    import json

    from _runtime import freeze_runtime

    def serialise_corpus() -> list[str]:
        # One template store per pass, shared by every case in that pass.
        dumps = []
        templates = InMemoryTemplateStore()
        with freeze_runtime(at="2026-05-11T00:00:00+00:00", seed=0):
            for golden in CASES:
                synonyms = make_synonyms()
                catalog = make_catalog()
                extracted = extract(
                    cells=translate(golden.docling_doc),
                    org_id="org_golden",
                    document_id=f"doc_{golden.name}",
                    template_store=templates,
                    synonym_store=synonyms,
                    vendor_resolver=catalog,
                    docling_doc_for_classify=golden.docling_doc,
                )
                # Sorted keys keep the dump independent of dict ordering;
                # anything json can't encode natively goes through str().
                invoice_json = json.dumps(
                    extracted.invoice, sort_keys=True, default=str
                )
                dumps.append(invoice_json)
        return dumps

    first_pass = serialise_corpus()
    second_pass = serialise_corpus()
    assert len(first_pass) == len(second_pass)
    for golden, left, right in zip(CASES, first_pass, second_pass):
        assert left == right, (
            f"[{golden.name}] byte-equal determinism gate failed.\n"
            f" run_a: {left[:160]}...\n"
            f" run_b: {right[:160]}..."
        )
def test_case_count_is_at_least_eleven():
    """Fail if the golden set shrinks below eleven cases.

    If the count drops below 11 someone removed regression coverage
    without thinking about it. The minimum bar covers every operator-
    pressure-test broken-scenario class plus the post-audit fixes.
    Adding cases never trips this check; only removals can.
    """
    assert len(CASES) >= 11, (
        f"Golden set has {len(CASES)} cases; minimum 11 covers the "
        f"regression-guard bar. Adding cases is fine; removing them "
        f"requires updating this assertion AND documenting why."
    )


# This is the file as it lives at the moment of this build. The canonical
# history lives in git. If you want the full history or a specific commit,
# write to hello@muntin.digital.