diff --git a/explorer.qmd b/explorer.qmd index fd61702..f3dac00 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -1612,20 +1612,52 @@ zoomWatcher = { }, 250); } + // --- Busy-flag depth counter (#173 review round 2) --- + // + // body.classList 'explorer-busy' tracks "any change-triggered async + // work in flight." Without depth counting, overlapping handlers race: + // a fast handler's `finally` removes the class while a slower + // handler's loadRes / facet recompute is still running, defeating the + // whole point of the flag. Depth-counted: class is added on the + // 0 → 1 transition and removed on the 1 → 0 transition. + let _busyDepth = 0; + function busyAcquire() { + if (_busyDepth === 0) document.body.classList.add('explorer-busy'); + _busyDepth++; + } + function busyRelease() { + _busyDepth = Math.max(0, _busyDepth - 1); + if (_busyDepth === 0) document.body.classList.remove('explorer-busy'); + } + // --- Source filter change handler --- + // + // The body.classList 'explorer-busy' flag wraps every async work path + // out of this handler so external observers (Playwright tests, + // perf-smoke harnesses) can wait for "all triggered work has settled" + // without race conditions against the debounced facet recompute. The + // 300ms post-refreshFacetCounts wait is intentional: refreshFacetCounts + // schedules a 250ms debounce that THEN sets the .recomputing class on + // facet count spans; we hold the busy flag until that has fired so the + // .recomputing-clear poll downstream is meaningful. See #173 review. 
const resUrls = { 4: h3_res4_url, 6: h3_res6_url, 8: h3_res8_url }; document.getElementById('sourceFilter').addEventListener('change', async () => { - // Toggle visual state on labels - updateSourceLegendState(); - writeQueryState(); - if (mode === 'cluster') { - loading = false; // allow loadRes to run (gen counter discards stale results) - await loadRes(currentRes, resUrls[currentRes]); - } else { - cachedBounds = null; // force re-query - await loadViewportSamples(); + busyAcquire(); + try { + updateSourceLegendState(); + writeQueryState(); + if (mode === 'cluster') { + loading = false; + await loadRes(currentRes, resUrls[currentRes]); + } else { + cachedBounds = null; + await loadViewportSamples(); + } + refreshFacetCounts(); + await new Promise(r => setTimeout(r, 300)); + } finally { + busyRelease(); } - refreshFacetCounts(); }); // --- Material / Context / Specimen Type filter change handler --- @@ -1636,15 +1668,21 @@ zoomWatcher = { // surface the explanatory `#facetNote` so users understand the filter // takes effect at neighborhood zoom. See issue #156, Phase 1. const facetNote = document.getElementById('facetNote'); - function handleFacetFilterChange() { - const active = hasFacetFilters(); - if (facetNote) facetNote.style.display = (active && mode === 'cluster') ? 'block' : 'none'; - writeQueryState(); - if (mode === 'point') { - cachedBounds = null; - loadViewportSamples(); + async function handleFacetFilterChange() { + busyAcquire(); + try { + const active = hasFacetFilters(); + if (facetNote) facetNote.style.display = (active && mode === 'cluster') ? 
'block' : 'none'; + writeQueryState(); + if (mode === 'point') { + cachedBounds = null; + await loadViewportSamples(); + } + refreshFacetCounts(); + await new Promise(r => setTimeout(r, 300)); + } finally { + busyRelease(); } - refreshFacetCounts(); } document.getElementById('materialFilterBody').addEventListener('change', handleFacetFilterChange); document.getElementById('contextFilterBody').addEventListener('change', handleFacetFilterChange); @@ -1779,6 +1817,8 @@ zoomWatcher = { const searchInput = document.getElementById('sampleSearch'); const searchResults = document.getElementById('searchResults'); + let _searchSeq = 0; + async function doSearch() { const term = searchInput.value.trim(); if (!term || term.length < 2) { @@ -1788,8 +1828,21 @@ zoomWatcher = { } writeQueryState(); searchResults.textContent = 'Searching...'; + + // Per-search perf instrumentation (#167). Captures cold/warm latency, + // result count, and bytes transferred from data.isamples.org during + // the search window. transferSize is 0 for cross-origin responses + // missing Timing-Allow-Origin; we fall back to encodedBodySize. 
+ const searchId = ++_searchSeq; + const markStart = `search-${searchId}-start`; + const markEnd = `search-${searchId}-end`; + performance.mark(markStart); + const tStart = performance.now(); + const terms = searchTerms(term); + let resultsCount = 0; + let errorMessage = null; + try { - const terms = searchTerms(term); const searchWhere = textSearchWhere(terms, ['label', 'CAST(place_name AS VARCHAR)']); const score = textSearchScore(terms, [ { col: 'label', weight: 3 }, @@ -1805,6 +1858,7 @@ zoomWatcher = { ORDER BY relevance_score DESC, label LIMIT 50 `); + resultsCount = results.length; if (results.length === 0) { searchResults.textContent = `No results for "${term}"`; return; @@ -1855,6 +1909,78 @@ zoomWatcher = { } catch(err) { console.error("Search failed:", err); searchResults.textContent = `Search error: ${err.message}`; + errorMessage = err.message || String(err); + } finally { + performance.mark(markEnd); + try { performance.measure(`search-${searchId}`, markStart, markEnd); } catch (e) {} + const elapsedMs = performance.now() - tStart; + + // Per-URL byte data from data.isamples.org during the search + // window. transferSize is 0 cross-origin without Timing-Allow-Origin; + // encodedBodySize is reported as a fallback. Per-URL detail (rather + // than just summed bytes) lets analysis post-hoc-filter concurrent + // fetches that are not actually attributable to the search. + const seenUrls = []; + let transferBytes = 0; + let bodyBytes = 0; + try { + const entries = performance.getEntriesByType('resource'); + for (const e of entries) { + if (!e.name.startsWith(R2_BASE)) continue; + if (e.startTime < tStart || e.startTime > tStart + elapsedMs) continue; + seenUrls.push({ + name: e.name, + transfer_size: e.transferSize || 0, + body_size: e.encodedBodySize || 0, + }); + transferBytes += (e.transferSize || 0); + bodyBytes += (e.encodedBodySize || 0); + } + } catch (e) {} + + // Structured log for Playwright capture (#167). 
+ try { + console.log(JSON.stringify({ + event: 'isamples.search', + id: searchId, + term: term, + terms_count: terms.length, + results_count: resultsCount, + elapsed_ms: Math.round(elapsedMs), + bytes_transfer: transferBytes, + bytes_body: bodyBytes, + seen_urls: seenUrls, + has_source_filter: getActiveSources().length !== SOURCE_VALUES.length, + has_facet_filter: hasFacetFilters(), + error: errorMessage, + })); + } catch (e) {} + + // Append a row to the ?perf=1 panel if it's open. The panel + // renders once at boot from existing performance.measure entries + // (perfPanel cell, ~:2010); this hooks each subsequent search + // so the panel stays current per the #173 review. + try { + const panel = document.getElementById('perfPanel'); + if (panel) { + const tbl = panel.querySelector('table'); + if (tbl) { + const fmt = (ms) => ms >= 1000 + ? (ms / 1000).toFixed(2) + ' s' + : Math.round(ms) + ' ms'; + const tr = document.createElement('tr'); + const labelCell = document.createElement('td'); + labelCell.style.cssText = 'padding:1px 8px 1px 0;color:#bbb;'; + labelCell.textContent = `search #${searchId}: "${term}" (${resultsCount})`; + const valCell = document.createElement('td'); + valCell.style.cssText = 'padding:1px 0;text-align:right;color:#a5d6a7;font-variant-numeric:tabular-nums;'; + valCell.textContent = fmt(elapsedMs); + tr.appendChild(labelCell); + tr.appendChild(valCell); + tbl.appendChild(tr); + } + } + } catch (e) {} } } @@ -1951,6 +2077,15 @@ perfPanel = { ['nav → first globe frame', mark('first-globe-frame')], ].filter(([, v]) => v != null); + // Append search timings if any have run by the time the panel renders + // (#167). Each search emits a structured console.log; the panel surface + // is purely informational here. 
+ const searchMeasures = performance.getEntriesByType('measure') + .filter(e => e.name.startsWith('search-')); + for (const m of searchMeasures) { + rows.push([`search ${m.name.replace(/^search-/, '#')}`, m.duration]); + } + // Console table for CI / offline capture console.table(Object.fromEntries(rows.map(([k, v]) => [k, `${v.toFixed(0)} ms`]))); diff --git a/tests/search_baseline_2026-05-08.json b/tests/search_baseline_2026-05-08.json new file mode 100644 index 0000000..c06be25 --- /dev/null +++ b/tests/search_baseline_2026-05-08.json @@ -0,0 +1,314 @@ +{ + "site_url": "http://localhost:5860", + "captured_at_utc": "2026-05-08T14:47:22.699069+00:00", + "schema_version": 1, + "field_subset": "label+place_name (samples_map_lite.parquet)", + "queries": [ + { + "label": "single-common", + "term": "pottery", + "filters": {}, + "cold": { + "event": "isamples.search", + "id": 1, + "term": "pottery", + "terms_count": 1, + "results_count": 50, + "elapsed_ms": 8683, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + }, + "warm": { + "event": "isamples.search", + "id": 2, + "term": "pottery", + "terms_count": 1, + "results_count": 50, + "elapsed_ms": 1940, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + } + }, + { + "label": "single-rare", + "term": "basalt", + "filters": {}, + "cold": { + "event": "isamples.search", + "id": 1, + "term": "basalt", + "terms_count": 1, + "results_count": 50, + "elapsed_ms": 11824, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + }, + "warm": { + "event": "isamples.search", + "id": 2, + "term": "basalt", + "terms_count": 1, + "results_count": 50, + "elapsed_ms": 2400, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": 
false, + "error": null + } + }, + { + "label": "multi-term", + "term": "pottery Cyprus", + "filters": {}, + "cold": { + "event": "isamples.search", + "id": 1, + "term": "pottery Cyprus", + "terms_count": 2, + "results_count": 0, + "elapsed_ms": 5071, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + }, + "warm": { + "event": "isamples.search", + "id": 2, + "term": "pottery Cyprus", + "terms_count": 2, + "results_count": 0, + "elapsed_ms": 2560, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + } + }, + { + "label": "no-hit", + "term": "xyzzyqqqplugh", + "filters": {}, + "cold": { + "event": "isamples.search", + "id": 1, + "term": "xyzzyqqqplugh", + "terms_count": 1, + "results_count": 0, + "elapsed_ms": 4870, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + }, + "warm": { + "event": "isamples.search", + "id": 2, + "term": "xyzzyqqqplugh", + "terms_count": 1, + "results_count": 0, + "elapsed_ms": 2554, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + } + }, + { + "label": "wildcard-pct", + "term": "100%", + "filters": {}, + "cold": { + "event": "isamples.search", + "id": 1, + "term": "100%", + "terms_count": 1, + "results_count": 0, + "elapsed_ms": 5178, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + }, + "warm": { + "event": "isamples.search", + "id": 2, + "term": "100%", + "terms_count": 1, + "results_count": 0, + "elapsed_ms": 2574, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + } + }, + { + "label": "wildcard-under", + "term": "_test", 
+ "filters": {}, + "cold": { + "event": "isamples.search", + "id": 1, + "term": "_test", + "terms_count": 1, + "results_count": 0, + "elapsed_ms": 18520, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + }, + "warm": { + "event": "isamples.search", + "id": 2, + "term": "_test", + "terms_count": 1, + "results_count": 0, + "elapsed_ms": 2529, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + } + }, + { + "label": "diacritic", + "term": "\u00c7atalh\u00f6y\u00fck", + "filters": {}, + "cold": { + "event": "isamples.search", + "id": 1, + "term": "\u00c7atalh\u00f6y\u00fck", + "terms_count": 1, + "results_count": 0, + "elapsed_ms": 4415, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + }, + "warm": { + "event": "isamples.search", + "id": 2, + "term": "\u00c7atalh\u00f6y\u00fck", + "terms_count": 1, + "results_count": 0, + "elapsed_ms": 2592, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": false, + "has_facet_filter": false, + "error": null + } + }, + { + "label": "composed-source", + "term": "pottery", + "filters": { + "source_only": [ + "OPENCONTEXT" + ] + }, + "cold": { + "event": "isamples.search", + "id": 1, + "term": "pottery", + "terms_count": 1, + "results_count": 50, + "elapsed_ms": 4964, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": true, + "has_facet_filter": false, + "error": null + }, + "warm": { + "event": "isamples.search", + "id": 2, + "term": "pottery", + "terms_count": 1, + "results_count": 50, + "elapsed_ms": 1368, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": true, + "has_facet_filter": false, + "error": null + } + }, + { + "label": "composed-source-material", + "term": "pottery", + 
"filters": { + "source_only": [ + "OPENCONTEXT" + ], + "material_first_n": 1 + }, + "cold": { + "event": "isamples.search", + "id": 1, + "term": "pottery", + "terms_count": 1, + "results_count": 0, + "elapsed_ms": 4331, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": true, + "has_facet_filter": true, + "error": null + }, + "warm": { + "event": "isamples.search", + "id": 2, + "term": "pottery", + "terms_count": 1, + "results_count": 0, + "elapsed_ms": 2106, + "bytes_transfer": 0, + "bytes_body": 0, + "seen_urls": [], + "has_source_filter": true, + "has_facet_filter": true, + "error": null + } + } + ] +} diff --git a/tests/test_search_perf.py b/tests/test_search_perf.py new file mode 100644 index 0000000..ce42b88 --- /dev/null +++ b/tests/test_search_perf.py @@ -0,0 +1,296 @@ +""" +Search perf-smoke baseline (#167, Track 1a of #165). + +Drives the canonical query set against the deployed Explorer and records +per-search timings + bytes-transferred for each query, both cold (fresh +page context) and warm (immediate repeat on the same page). + +Cold here means file-cold: a fresh browser context, no DuckDB-WASM cache, +no HTTP cache. Warm means a second invocation of the same query on the +same page after the cold run completes. + +Usage: + + # Against the deployed site (default): + pytest tests/test_search_perf.py -s + + # Against a local Quarto preview: + ISAMPLES_BASE_URL=http://localhost:5860 pytest tests/test_search_perf.py -s + + # Against a deployed PR preview: + ISAMPLES_BASE_URL=https://{pr-preview-host} pytest tests/test_search_perf.py -s + +The benchmark JSON is written to: + + tests/search_baseline_{YYYY-MM-DD}.json + +This is slow (~8-15 minutes total) because each query opens a fresh context +to capture true file-cold timings. Run as a release-time benchmark, not on +every CI invocation. + +Open design questions tracked on issue #167. 
+""" +import datetime as dt +import json +import os +import pathlib +import pytest +from conftest import SITE_URL + + +EXPLORER_URL = f"{SITE_URL}/explorer.html?perf=1" + +# Canonical query set (locked in #167). Each entry drives one cold + one +# warm measurement. Filter-composition cases set source/facet selections +# before searching. +CANONICAL_QUERIES = [ + {"label": "single-common", "term": "pottery", "filters": {}}, + {"label": "single-rare", "term": "basalt", "filters": {}}, + {"label": "multi-term", "term": "pottery Cyprus", "filters": {}}, + {"label": "no-hit", "term": "xyzzyqqqplugh", "filters": {}}, + {"label": "wildcard-pct", "term": "100%", "filters": {}}, + {"label": "wildcard-under", "term": "_test", "filters": {}}, + {"label": "diacritic", "term": "Çatalhöyük", "filters": {}}, + { + "label": "composed-source", + "term": "pottery", + "filters": {"source_only": ["OPENCONTEXT"]}, + }, + { + # Pairs source restriction with a material-facet selection so the + # benchmark exercises the facetFilterSQL() pid-IN-subquery path, + # not just sourceFilterSQL(). The first material checkbox is used + # to keep the test stable across data refreshes (don't hard-code a + # URI that may disappear between snapshots). + "label": "composed-source-material", + "term": "pottery", + "filters": { + "source_only": ["OPENCONTEXT"], + "material_first_n": 1, + }, + }, +] + + +def _wait_for_explorer_ready(page, timeout_ms: int = 90_000) -> None: + """Wait until DuckDB-WASM has loaded facets — proxy for "ready to search".""" + page.wait_for_function( + """() => { + const el = document.querySelector( + ".facet-count[data-facet='source'][data-value='SESAR']" + ); + return el && /\\(\\d/.test(el.textContent || ''); + }""", + timeout=timeout_ms, + ) + + +def _wait_for_facet_settle(page, timeout_ms: int = 30_000) -> None: + """Block until all change-triggered async work has fully settled. + + Two-phase wait, matching the app's two-stage settle (#173 review): + + 1. 
body.classList.contains('explorer-busy') is set by the source / + material / context / object_type change handlers around their + async work. We wait for it to clear — guarantees that loadRes, + loadViewportSamples, and the 250 ms refreshFacetCounts debounce + have all fired. + 2. .facet-count.recomputing is set during the actual cross-filter + query and cleared as each dimension's results arrive. We wait + for it to clear — guarantees the in-flight count queries are + done. + + Polling either signal alone races: just-recomputing because the + debounce hasn't fired yet, just-busy because the change handler + chose a different work path. + """ + page.wait_for_function( + """() => !document.body.classList.contains('explorer-busy')""", + timeout=timeout_ms, + ) + page.wait_for_function( + """() => { + const recomputing = document.querySelectorAll('.facet-count.recomputing'); + return recomputing.length === 0; + }""", + timeout=timeout_ms, + ) + + +def _apply_source_filter(page, sources_to_keep_checked: list[str]) -> None: + """Uncheck source checkboxes that aren't in the keep list.""" + all_sources = ["SESAR", "OPENCONTEXT", "GEOME", "SMITHSONIAN"] + changed = False + for src in all_sources: + cb = page.locator(f"#sourceFilter input[type='checkbox'][value='{src}']") + is_checked = cb.is_checked() + should_be_checked = src in sources_to_keep_checked + if is_checked != should_be_checked: + cb.click() + changed = True + if changed: + _wait_for_facet_settle(page) + + +def _apply_material_first_n(page, n: int) -> None: + """Check the first n material-facet checkboxes (avoids hard-coding URIs). + + The material filter section ships with `display: none` on the body + (explorer.qmd:280); the header click handler toggles it. Expand the + section before attempting to click any checkbox inside it. 
+ """ + if n <= 0: + return + boxes = page.locator("#materialFilterBody input[type='checkbox']") + boxes.first.wait_for(state="attached", timeout=15_000) + body_hidden = page.evaluate( + "() => document.getElementById('materialFilterBody').style.display === 'none'" + ) + if body_hidden: + page.locator("#materialFilter .filter-header").click() + total = boxes.count() + for i in range(min(n, total)): + cb = boxes.nth(i) + if not cb.is_checked(): + cb.click() + _wait_for_facet_settle(page) + + +def _run_search(page, term: str, *, captured: list, expected_id_after: int) -> dict: + """Type term, click search, wait for the corresponding console event.""" + search_input = page.locator("#sampleSearch") + search_input.click() + # Clear via select-all + delete (faster + works around platform shortcuts). + search_input.press("ControlOrMeta+a") + search_input.press("Delete") + search_input.fill(term) + page.locator("#searchBtn").click() + + # Wait for an isamples.search log whose id is strictly greater than the + # last one we observed. Polling is simpler than promise-based waits here. 
+ deadline = page.evaluate("() => Date.now()") + 90_000 + while True: + for entry in captured: + if entry.get("id", -1) > expected_id_after and entry.get("term") == term: + return entry + if page.evaluate("() => Date.now()") > deadline: + raise TimeoutError(f"No isamples.search log captured for term={term!r}") + page.wait_for_timeout(250) + + +def _collect_search_logs(page, captured: list) -> None: + """Attach a console listener that parses isamples.search JSON events.""" + def _on_console(msg): + if msg.type != "log": + return + text = msg.text + if "isamples.search" not in text: + return + try: + payload = json.loads(text) + except (json.JSONDecodeError, ValueError): + return + if isinstance(payload, dict) and payload.get("event") == "isamples.search": + captured.append(payload) + page.on("console", _on_console) + + +def _measure_one_query(browser, query: dict) -> dict: + """Open fresh context, run cold + warm, return aggregated record. + + try/finally on context.close() so a timeout in _run_search (or any + intermediate step) does not leak the browser context, which would + skew later measurements (#173 review). 
+ """ + context = browser.new_context(viewport={"width": 1280, "height": 900}) + try: + page = context.new_page() + captured: list = [] + _collect_search_logs(page, captured) + page.goto(EXPLORER_URL, wait_until="domcontentloaded", timeout=60_000) + _wait_for_explorer_ready(page) + + filters = query["filters"] + if "source_only" in filters: + _apply_source_filter(page, filters["source_only"]) + if "material_first_n" in filters: + _apply_material_first_n(page, filters["material_first_n"]) + + cold = _run_search( + page, query["term"], captured=captured, expected_id_after=0 + ) + warm = _run_search( + page, query["term"], captured=captured, expected_id_after=cold["id"] + ) + finally: + context.close() + return { + "label": query["label"], + "term": query["term"], + "filters": query["filters"], + "cold": cold, + "warm": warm, + } + + +def _utc_now() -> dt.datetime: + """Aware UTC datetime; replaces the deprecated dt.datetime.utcnow().""" + return dt.datetime.now(dt.timezone.utc) + + +@pytest.fixture(scope="session") +def benchmark_run_started_at() -> dt.datetime: + return _utc_now() + + +@pytest.fixture(scope="session") +def baseline_output_path(benchmark_run_started_at) -> pathlib.Path: + stamp = benchmark_run_started_at.strftime("%Y-%m-%d") + path = pathlib.Path(__file__).parent / f"search_baseline_{stamp}.json" + return path + + +def test_record_search_baseline(browser, benchmark_run_started_at, baseline_output_path): + """Run the canonical query set, dump JSON. Single test = one benchmark run.""" + results = [] + failures = [] + for query in CANONICAL_QUERIES: + try: + record = _measure_one_query(browser, query) + except Exception as exc: + record = { + "label": query["label"], + "term": query["term"], + "filters": query["filters"], + "error": f"{type(exc).__name__}: {exc}", + } + failures.append(record) + results.append(record) + # Stream to stdout so partial runs are still useful. 
+ print(json.dumps(record, indent=2)) + + payload = { + "site_url": SITE_URL, + "captured_at_utc": benchmark_run_started_at.isoformat(), + "schema_version": 1, + "field_subset": "label+place_name (samples_map_lite.parquet)", + "queries": results, + } + baseline_output_path.write_text(json.dumps(payload, indent=2) + "\n") + print(f"\nWrote baseline to {baseline_output_path}") + + # A benchmark with silent failures is a poisoned baseline — refuse to + # treat it as valid. Partial data is still on disk for diagnosis. + incomplete = [ + r for r in results + if "error" in r + or "cold" not in r + or r.get("cold", {}).get("elapsed_ms") is None + or "warm" not in r + or r.get("warm", {}).get("elapsed_ms") is None + ] + assert not incomplete, ( + f"{len(incomplete)}/{len(CANONICAL_QUERIES)} queries did not complete cleanly. " + f"Failed labels: {[r['label'] for r in incomplete]}. " + f"Partial data at {baseline_output_path}." + )