From d254a1ec15262fd0168881e251be18936c911fe2 Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Sun, 24 May 2026 18:38:25 +0300 Subject: [PATCH 1/5] feat: add grants-explorer CLI for natural-language SQL queries over paatokset.xlsx --- package.json | 8 +- pnpm-lock.yaml | 72 +++++ src/cli/grants-explorer/README.md | 61 +++++ .../grants-explorer/clients/database.test.ts | 97 +++++++ src/cli/grants-explorer/clients/database.ts | 114 ++++++++ .../clients/xlsx-loader.test.ts | 107 ++++++++ .../grants-explorer/clients/xlsx-loader.ts | 249 ++++++++++++++++++ src/cli/grants-explorer/constants.ts | 3 + src/cli/grants-explorer/main.ts | 115 ++++++++ src/cli/grants-explorer/tools/sql-tool.ts | 88 +++++++ src/cli/grants-explorer/types/schemas.ts | 40 +++ 11 files changed, 951 insertions(+), 3 deletions(-) create mode 100644 src/cli/grants-explorer/README.md create mode 100644 src/cli/grants-explorer/clients/database.test.ts create mode 100644 src/cli/grants-explorer/clients/database.ts create mode 100644 src/cli/grants-explorer/clients/xlsx-loader.test.ts create mode 100644 src/cli/grants-explorer/clients/xlsx-loader.ts create mode 100644 src/cli/grants-explorer/constants.ts create mode 100644 src/cli/grants-explorer/main.ts create mode 100644 src/cli/grants-explorer/tools/sql-tool.ts create mode 100644 src/cli/grants-explorer/types/schemas.ts diff --git a/package.json b/package.json index 1f660f4..2c9432b 100644 --- a/package.json +++ b/package.json @@ -16,6 +16,7 @@ "run:owner-lookup": "tsx src/cli/owner-lookup/main.ts", "run:update-docs": "tsx src/cli/update-docs/main.ts", "run:resolve-pr-comments": "tsx src/cli/resolve-pr-comments/main.ts", + "run:grants-explorer": "pnpm -s node:tsx -- src/cli/grants-explorer/main.ts", "node:tsx": "node --disable-warning=ExperimentalWarning --import tsx", "typecheck": "tsc --noEmit", "lint": "eslint .", @@ -58,13 +59,14 @@ "prettier": "3.7.4", "sanitize-html": "2.17.0", "slug": "11.0.1", + 
"stream-json": "2.0.0", "tsx": "4.21.0", "typescript": "5.9.3", "typescript-eslint": "8.52.0", + "unzipper": "0.12.3", "vitest": "4.0.16", + "xlsx": "0.18.5", "zod": "4.3.5", - "zx": "8.8.5", - "stream-json": "2.0.0", - "unzipper": "0.12.3" + "zx": "8.8.5" } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c801941..87e336e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -92,6 +92,9 @@ importers: vitest: specifier: 4.0.16 version: 4.0.16(@types/node@25.0.6)(jiti@2.6.1)(jsdom@27.4.0)(tsx@4.21.0) + xlsx: + specifier: 0.18.5 + version: 0.18.5 zod: specifier: 4.3.5 version: 4.3.5 @@ -759,6 +762,10 @@ packages: engines: {node: '>=0.4.0'} hasBin: true + adler-32@1.3.1: + resolution: {integrity: sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A==} + engines: {node: '>=0.8'} + agent-base@7.1.4: resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==} engines: {node: '>= 14'} @@ -862,6 +869,10 @@ packages: resolution: {integrity: sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==} engines: {node: '>=6'} + cfb@1.2.2: + resolution: {integrity: sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==} + engines: {node: '>=0.8'} + chai@6.2.2: resolution: {integrity: sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg==} engines: {node: '>=18'} @@ -870,6 +881,10 @@ packages: resolution: {integrity: sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==} engines: {node: '>=10'} + codepage@1.15.0: + resolution: {integrity: sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA==} + engines: {node: '>=0.8'} + color-convert@2.0.1: resolution: {integrity: sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==} engines: {node: 
'>=7.0.0'} @@ -903,6 +918,11 @@ packages: resolution: {integrity: sha512-KIHbLJqu73RGr/hnbrO9uBeixNGuvSQjul/jdFvS/KFSIH1hWVd1ng7zOHx+YrEfInLG7q4n6GHQ9cDtxv/P6g==} engines: {node: '>= 0.10'} + crc-32@1.2.2: + resolution: {integrity: sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ==} + engines: {node: '>=0.8'} + hasBin: true + cross-spawn@7.0.6: resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} engines: {node: '>= 8'} @@ -1216,6 +1236,10 @@ packages: resolution: {integrity: sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==} engines: {node: '>= 0.6'} + frac@1.1.2: + resolution: {integrity: sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA==} + engines: {node: '>=0.8'} + fresh@2.0.0: resolution: {integrity: sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==} engines: {node: '>= 0.8'} @@ -1911,6 +1935,10 @@ packages: resolution: {integrity: sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==} engines: {node: '>=0.10.0'} + ssf@0.11.2: + resolution: {integrity: sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g==} + engines: {node: '>=0.8'} + stackback@0.0.2: resolution: {integrity: sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==} @@ -2197,10 +2225,18 @@ packages: engines: {node: '>=8'} hasBin: true + wmf@1.0.2: + resolution: {integrity: sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw==} + engines: {node: '>=0.8'} + word-wrap@1.2.5: resolution: {integrity: sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==} engines: {node: '>=0.10.0'} + word@0.3.0: + resolution: {integrity: 
sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA==} + engines: {node: '>=0.8'} + wrappy@1.0.2: resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==} @@ -2216,6 +2252,11 @@ packages: utf-8-validate: optional: true + xlsx@0.18.5: + resolution: {integrity: sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ==} + engines: {node: '>=0.8'} + hasBin: true + xml-name-validator@5.0.0: resolution: {integrity: sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==} engines: {node: '>=18'} @@ -2852,6 +2893,8 @@ snapshots: acorn@8.15.0: {} + adler-32@1.3.1: {} + agent-base@7.1.4: {} ajv-formats@3.0.1(ajv@8.17.1): @@ -2994,6 +3037,11 @@ snapshots: callsites@3.1.0: {} + cfb@1.2.2: + dependencies: + adler-32: 1.3.1 + crc-32: 1.2.2 + chai@6.2.2: {} chalk@4.1.2: @@ -3001,6 +3049,8 @@ snapshots: ansi-styles: 4.3.0 supports-color: 7.2.0 + codepage@1.15.0: {} + color-convert@2.0.1: dependencies: color-name: 1.1.4 @@ -3029,6 +3079,8 @@ snapshots: vary: 1.1.2 optional: true + crc-32@1.2.2: {} + cross-spawn@7.0.6: dependencies: path-key: 3.1.1 @@ -3490,6 +3542,8 @@ snapshots: forwarded@0.2.0: optional: true + frac@1.1.2: {} + fresh@2.0.0: optional: true @@ -4286,6 +4340,10 @@ snapshots: source-map-js@1.2.1: {} + ssf@0.11.2: + dependencies: + frac: 1.1.2 + stackback@0.0.2: {} statuses@2.0.2: @@ -4594,13 +4652,27 @@ snapshots: siginfo: 2.0.0 stackback: 0.0.2 + wmf@1.0.2: {} + word-wrap@1.2.5: {} + word@0.3.0: {} + wrappy@1.0.2: optional: true ws@8.19.0: {} + xlsx@0.18.5: + dependencies: + adler-32: 1.3.1 + cfb: 1.2.2 + codepage: 1.15.0 + crc-32: 1.2.2 + ssf: 0.11.2 + wmf: 1.0.2 + word: 0.3.0 + xml-name-validator@5.0.0: {} xmlchars@2.2.0: {} diff --git a/src/cli/grants-explorer/README.md b/src/cli/grants-explorer/README.md new file mode 100644 index 0000000..57cb47b --- /dev/null +++ 
b/src/cli/grants-explorer/README.md @@ -0,0 +1,61 @@ +# Grants Explorer + +Loads the Finnish grant-decisions workbook at `tmp/paatokset.xlsx` into an in-memory SQLite database and answers natural-language questions via an OpenAI agent that has a read-only `query_grants` SQL tool. + +## Run + +``` +pnpm run:grants-explorer +pnpm run:grants-explorer --file=tmp/paatokset.xlsx +``` + +## Arguments + +- `--file` (optional): path to the xlsx workbook. Defaults to `tmp/paatokset.xlsx`. + +## Table schema + +| Column | Type | Source header | +| -------------------- | ------- | ---------------------------- | +| `decision_date` | TEXT | Päätös pvm (ISO date) | +| `recipient` | TEXT | Saajan nimi (incl. y-tunnus) | +| `granting_authority` | TEXT | Myöntäjä | +| `case_number` | TEXT | Asianumero | +| `amount_applied` | INTEGER | Haettu (EUR, nullable) | +| `amount_granted` | INTEGER | Myönnetty (EUR, nullable) | +| `has_eu_funding` | INTEGER | EU-varat (0/1) | +| `purpose` | TEXT | Hyväksytty käyttötarkoitus | +| `programme` | TEXT | Haun nimi (asianumero) | +| `region` | TEXT | Alueet | + +`amount_applied` / `amount_granted` are nullable so an unknown amount stays distinguishable from a real `0 €` decision in aggregates. + +## Example session + +``` +$ pnpm run:grants-explorer +Ask about Finnish grant decisions: How much has Lapin ELY-keskus granted in total? +[ANSWER] Lapin ELY-keskus has granted approximately X € across N decisions. 
+``` + +## Flowchart + +```mermaid +flowchart TD + A["Start"] --> B["Parse --file"] + B --> C["XlsxLoader.load() → GrantRow[]"] + C --> D["GrantsDatabase :memory: INSERT"] + D --> E["AgentRunner with query_grants tool"] + E --> F{"User question?"} + F -->|"yes"| G["Agent runs SQL via tool"] + G --> H{"status"} + H -->|"final"| I["Print answer"] + H -->|"needs_clarification"| F + F -->|"empty"| J["Done"] + I --> J +``` + +## Notes + +- `xlsx` (SheetJS) is used because the source workbook omits the optional cell `r` (reference) attribute and uses an unusual `x:` element-namespace prefix; `read-excel-file` and `exceljs` both rejected this layout in testing. +- `paatos_pvm` cells arrive as raw Excel serial numbers (date styling without the `t="d"` cell type), so the loader explicitly converts via `XLSX.SSF.parse_date_code`. diff --git a/src/cli/grants-explorer/clients/database.test.ts b/src/cli/grants-explorer/clients/database.test.ts new file mode 100644 index 0000000..2d86e31 --- /dev/null +++ b/src/cli/grants-explorer/clients/database.test.ts @@ -0,0 +1,97 @@ +import { Logger } from "~clients/logger"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; + +import type { GrantRow } from "../types/schemas"; +import { GrantsDatabase } from "./database"; + +const silentLogger = new Logger({ + level: "error", + useColors: false, + useTimestamps: false, +}); + +const row = (overrides: Partial = {}): GrantRow => ({ + decision_date: "2026-01-15", + recipient: "Test ry", + granting_authority: "Lapin ELY-keskus", + case_number: "001", + amount_applied: 1000, + amount_granted: 800, + has_eu_funding: 0, + purpose: "Test purpose", + programme: "Test programme", + region: "Test region", + ...overrides, +}); + +describe("GrantsDatabase", () => { + let db: GrantsDatabase; + + beforeEach(() => { + db = new GrantsDatabase(silentLogger); + }); + + afterEach(() => { + db.close(); + }); + + it("inserts rows and counts them", () => { + db.insertRows([row(), row(), 
row()]); + expect(db.getTotalCount()).toBe(3); + }); + + it("SUM(amount_granted) treats null amounts as null (not 0)", () => { + db.insertRows([ + row({ granting_authority: "Authority A", amount_granted: 100 }), + row({ granting_authority: "Authority A", amount_granted: 200 }), + row({ granting_authority: "Authority A", amount_granted: null }), + row({ granting_authority: "Authority B", amount_granted: 500 }), + ]); + + // Authority A: SUM ignores the null row → 300, not 0+100+200. + const a = db.queryOne<{ s: number | null }>( + "SELECT SUM(amount_granted) as s FROM grants WHERE granting_authority = ?", + ["Authority A"] + ); + expect(a?.s).toBe(300); + + // Same authority, COUNT(amount_granted) excludes the null; COUNT(*) includes it. + const c = db.queryOne<{ total: number; non_null: number }>( + "SELECT COUNT(*) as total, COUNT(amount_granted) as non_null FROM grants WHERE granting_authority = ?", + ["Authority A"] + ); + expect(c?.total).toBe(3); + expect(c?.non_null).toBe(2); + }); + + it("filters by has_eu_funding correctly", () => { + db.insertRows([ + row({ has_eu_funding: 1 }), + row({ has_eu_funding: 0 }), + row({ has_eu_funding: 0 }), + ]); + const result = db.queryOne<{ n: number }>( + "SELECT COUNT(*) as n FROM grants WHERE has_eu_funding = 1" + ); + expect(result?.n).toBe(1); + }); + + it("supports LIKE search across recipient (incl. 
y-tunnus)", () => { + db.insertRows([ + row({ recipient: "Lapin Martat ry (0210606-0)" }), + row({ recipient: "Rikala-seura ry (2477520-6)" }), + ]); + const result = db.query<{ recipient: string }>( + "SELECT recipient FROM grants WHERE recipient LIKE ?", + ["%0210606-0%"] + ); + expect(result).toHaveLength(1); + expect(result[0]?.recipient).toBe("Lapin Martat ry (0210606-0)"); + }); + + it("CHECK constraint rejects out-of-range has_eu_funding", () => { + expect(() => { + db.insertRows([row({ has_eu_funding: 2 as unknown as 0 | 1 })]); + }).toThrow(); + }); +}); diff --git a/src/cli/grants-explorer/clients/database.ts b/src/cli/grants-explorer/clients/database.ts new file mode 100644 index 0000000..a9d97b2 --- /dev/null +++ b/src/cli/grants-explorer/clients/database.ts @@ -0,0 +1,114 @@ +import { DatabaseSync } from "node:sqlite"; +import type { SQLInputValue } from "node:sqlite"; +import type { Logger } from "~clients/logger"; + +import type { GrantRow } from "../types/schemas"; + +/** + * In-memory SQLite for the `grants` table populated from paatokset.xlsx. + * + * Column names anglicized for easier LLM use; the SQL tool description + * documents the original Finnish header mapping for the agent. 
/**
 * In-memory SQLite store for the `grants` table populated from paatokset.xlsx.
 *
 * Column names are anglicized for easier LLM use; the SQL tool description
 * documents the original Finnish header mapping for the agent.
 */
export class GrantsDatabase {
  private readonly db: DatabaseSync;
  private readonly logger: Logger;

  /**
   * Opens a fresh `:memory:` database and creates the `grants` schema.
   *
   * @param logger - Receives debug/warn diagnostics.
   */
  constructor(logger: Logger) {
    this.logger = logger;
    this.db = new DatabaseSync(":memory:");
    this.createSchema();
  }

  /** Creates the `grants` table and its lookup indexes. */
  private createSchema(): void {
    this.db.exec(`
      CREATE TABLE grants (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        decision_date TEXT,
        recipient TEXT,
        granting_authority TEXT,
        case_number TEXT,
        amount_applied INTEGER,
        amount_granted INTEGER,
        has_eu_funding INTEGER NOT NULL CHECK (has_eu_funding IN (0, 1)),
        purpose TEXT,
        programme TEXT,
        region TEXT
      );

      CREATE INDEX idx_grants_granting_authority ON grants(granting_authority);
      CREATE INDEX idx_grants_region ON grants(region);
      CREATE INDEX idx_grants_decision_date ON grants(decision_date);
      CREATE INDEX idx_grants_has_eu_funding ON grants(has_eu_funding);
    `);
    this.logger.debug("Grants schema created");
  }

  /**
   * Inserts all rows inside one transaction: either every row lands or none.
   *
   * @param rows - Normalized grant rows.
   * @throws Re-throws the first insert error (e.g. a CHECK violation on
   *   `has_eu_funding`) after rolling the transaction back.
   */
  insertRows(rows: GrantRow[]): void {
    const insert = this.db.prepare(`
      INSERT INTO grants (
        decision_date, recipient, granting_authority, case_number,
        amount_applied, amount_granted, has_eu_funding,
        purpose, programme, region
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `);

    this.db.exec("BEGIN");
    try {
      for (const row of rows) {
        insert.run(
          row.decision_date,
          row.recipient,
          row.granting_authority,
          row.case_number,
          row.amount_applied,
          row.amount_granted,
          row.has_eu_funding,
          row.purpose,
          row.programme,
          row.region
        );
      }
      this.db.exec("COMMIT");
    } catch (error) {
      // A failing ROLLBACK must not replace the error that caused it; the
      // original insert failure is what the caller needs to see.
      try {
        this.db.exec("ROLLBACK");
      } catch (rollbackError) {
        this.logger.warn("Rollback failed after insert error", {
          rollbackError,
        });
      }
      throw error;
    }
    this.logger.debug("Inserted grants", { count: rows.length });
  }

  /**
   * Runs `sql` and returns every result row.
   *
   * @param sql - Statement text; positional `?` placeholders are bound from `params`.
   * @param params - Values bound in order.
   * @param mapRow - Optional converter applied to each raw row. Without it the
   *   rows are returned as `T` unchecked, so the caller vouches for the shape.
   */
  query<T>(
    sql: string,
    params: SQLInputValue[] = [],
    mapRow?: (row: unknown) => T
  ): T[] {
    const stmt = this.db.prepare(sql);
    const rows = stmt.all(...params) as unknown[];
    return mapRow ? rows.map(mapRow) : (rows as T[]);
  }

  /**
   * Runs `sql` and returns the first result row, or `undefined` when the
   * statement yields no rows. Parameters behave as in {@link query}.
   */
  queryOne<T>(
    sql: string,
    params: SQLInputValue[] = [],
    mapRow?: (row: unknown) => T
  ): T | undefined {
    const stmt = this.db.prepare(sql);
    const row = stmt.get(...params) as unknown;
    if (row === undefined) {
      return undefined;
    }
    return mapRow ? mapRow(row) : (row as T);
  }

  /** Returns the number of rows currently in `grants`. */
  getTotalCount(): number {
    const result = this.db
      .prepare("SELECT COUNT(*) as count FROM grants")
      .get() as { count: number };
    return result.count;
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.logger.debug("Closing grants database");
    this.db.close();
  }
}
+ const serial = 46022; + const parsed = ssf.parse_date_code(serial); + if (!parsed) { + throw new Error("XLSX.SSF.parse_date_code returned undefined"); + } + const pad = (n: number) => String(n).padStart(2, "0"); + expect(normalizeExcelDate(serial)).toBe( + `${parsed.y}-${pad(parsed.m)}-${pad(parsed.d)}` + ); + }); + + it("converts JS Date objects to YYYY-MM-DD", () => { + expect(normalizeExcelDate(new Date("2026-01-15T12:34:56.000Z"))).toBe( + "2026-01-15" + ); + }); + + it("returns trimmed pre-formatted strings unchanged", () => { + expect(normalizeExcelDate("2026-01-15")).toBe("2026-01-15"); + expect(normalizeExcelDate(" 2026-01-15 ")).toBe("2026-01-15"); + }); + + it("returns null for null, undefined, empty/whitespace, NaN, invalid Date", () => { + expect(normalizeExcelDate(null)).toBeNull(); + expect(normalizeExcelDate(undefined)).toBeNull(); + expect(normalizeExcelDate("")).toBeNull(); + expect(normalizeExcelDate(" ")).toBeNull(); + expect(normalizeExcelDate(Number.NaN)).toBeNull(); + expect(normalizeExcelDate(new Date("not-a-date"))).toBeNull(); + }); +}); + +describe("normalizeAmount", () => { + it("passes through finite numbers (rounded to integer)", () => { + expect(normalizeAmount(49855)).toBe(49855); + expect(normalizeAmount(0)).toBe(0); + expect(normalizeAmount(49855.4)).toBe(49855); + expect(normalizeAmount(49855.6)).toBe(49856); + }); + + it("parses numeric strings", () => { + expect(normalizeAmount("49855")).toBe(49855); + expect(normalizeAmount(" 49855 ")).toBe(49855); + }); + + it("returns null for null/undefined/empty/NaN/non-numeric (NOT 0, NOT skip)", () => { + expect(normalizeAmount(null)).toBeNull(); + expect(normalizeAmount(undefined)).toBeNull(); + expect(normalizeAmount("")).toBeNull(); + expect(normalizeAmount(" ")).toBeNull(); + expect(normalizeAmount(Number.NaN)).toBeNull(); + expect(normalizeAmount(Number.POSITIVE_INFINITY)).toBeNull(); + expect(normalizeAmount("abc")).toBeNull(); + expect(normalizeAmount({})).toBeNull(); + }); +}); + 
+describe("normalizeEuFunding", () => { + it("returns 1 for any non-empty trimmed string", () => { + expect( + normalizeEuFunding( + "https://upload.wikimedia.org/wikipedia/commons/b/b7/Flag_of_Europe.svg" + ) + ).toBe(1); + expect(normalizeEuFunding("x")).toBe(1); + }); + + it("returns 0 for null/undefined/empty/whitespace", () => { + expect(normalizeEuFunding(null)).toBe(0); + expect(normalizeEuFunding(undefined)).toBe(0); + expect(normalizeEuFunding("")).toBe(0); + expect(normalizeEuFunding(" ")).toBe(0); + }); +}); + +describe("normalizeText", () => { + it("trims and preserves non-empty strings", () => { + expect(normalizeText("Lapin Martat ry")).toBe("Lapin Martat ry"); + expect(normalizeText(" Salo ")).toBe("Salo"); + }); + + it("returns null for null/undefined/empty/whitespace", () => { + expect(normalizeText(null)).toBeNull(); + expect(normalizeText(undefined)).toBeNull(); + expect(normalizeText("")).toBeNull(); + expect(normalizeText(" ")).toBeNull(); + }); +}); diff --git a/src/cli/grants-explorer/clients/xlsx-loader.ts b/src/cli/grants-explorer/clients/xlsx-loader.ts new file mode 100644 index 0000000..e49e7af --- /dev/null +++ b/src/cli/grants-explorer/clients/xlsx-loader.ts @@ -0,0 +1,249 @@ +import type { Logger } from "~clients/logger"; +import XLSX from "xlsx"; + +import type { GrantRow } from "../types/schemas"; +import { GrantRowSchema } from "../types/schemas"; + +type RawCell = string | number | boolean | Date | null; +type RawRow = RawCell[]; + +// The xlsx package declares `SSF` as `any`; this typed wrapper avoids +// no-unsafe-call/member-access lint errors at every call site. +type ParsedDateCode = { y: number; m: number; d: number }; +const parseExcelSerialDate = (serial: number): ParsedDateCode | undefined => { + const ssf = XLSX.SSF as { + parse_date_code: (n: number) => ParsedDateCode | undefined; + }; + return ssf.parse_date_code(serial); +}; + +// Finnish column name (source xlsx header) → English GrantRow field. 
// Digits grouped in threes by a single whitespace character, optionally with a
// decimal part: "49 855", "1 234 567,50". `\s` also matches the no-break and
// narrow no-break spaces that spreadsheets use as group separators.
const GROUPED_AMOUNT_PATTERN = /^[+-]?\d{1,3}(?:\s\d{3})+(?:[.,]\d+)?$/;

// Plain decimal notation with "." or "," as the decimal mark and an optional
// exponent. Deliberately excludes the 0x / 0b / 0o prefixes that `Number()`
// would otherwise accept.
const PLAIN_AMOUNT_PATTERN =
  /^[+-]?(?:\d+(?:[.,]\d*)?|[.,]\d+)(?:e[+-]?\d+)?$/i;

/**
 * Coerce a cell to an integer EUR amount, or `null` when missing/invalid.
 *
 * Returns `null` (NOT `0`, NOT row-skip) for invalid input. A missing or
 * malformed grant amount must remain distinguishable from a real `0 €`
 * decision in SUM/AVG aggregates.
 *
 * Accepted input:
 * - finite numbers, rounded to the nearest integer;
 * - decimal strings, including Finnish formatting with a decimal comma and
 *   whitespace-grouped thousands ("49 855,60" → 49856).
 *
 * Rejected (→ `null`): empty/whitespace strings, non-decimal notations such as
 * "0x10", mixed separators such as "1,234.56", non-finite numbers, and every
 * other value type.
 *
 * @param value - Raw cell value.
 * @returns The rounded amount (never `-0`), or `null`.
 */
export const normalizeAmount = (value: unknown): number | null => {
  let numeric: number;

  if (typeof value === "number") {
    numeric = value;
  } else if (typeof value === "string") {
    const trimmed = value.trim();
    let digits: string;
    if (GROUPED_AMOUNT_PATTERN.test(trimmed)) {
      digits = trimmed.replace(/\s/g, "");
    } else if (PLAIN_AMOUNT_PATTERN.test(trimmed)) {
      digits = trimmed;
    } else {
      return null;
    }
    // Both patterns allow at most one decimal mark, so a single replacement
    // is enough to turn a decimal comma into the "." that Number() expects.
    numeric = Number(digits.replace(",", "."));
  } else {
    return null;
  }

  if (!Number.isFinite(numeric)) {
    return null;
  }
  const rounded = Math.round(numeric);
  // Math.round(-0.4) is -0; collapse it so callers only ever see plain 0.
  return rounded === 0 ? 0 : rounded;
};
/**
 * Loads `paatokset.xlsx` into typed GrantRow records.
 *
 * Header mapping is by exact Finnish column name (see HEADER_TO_FIELD); column
 * order in the workbook does not matter. Unknown headers are ignored.
 */
export class XlsxLoader {
  private readonly logger: Logger;

  constructor({ logger }: XlsxLoaderOptions) {
    this.logger = logger;
  }

  /**
   * Reads the first sheet of `filePath` and returns its validated rows.
   *
   * Entirely blank sheet rows are skipped. Rows that fail `GrantRowSchema`
   * are logged and skipped. A date cell that cannot be normalized is logged
   * and stored as `null`; the row itself is kept.
   *
   * @param filePath - Path to the workbook.
   * @returns One GrantRow per non-blank, schema-valid data row.
   * @throws Error when the workbook has no sheet, the sheet has no rows, the
   *   header row is empty, or an expected column header is missing.
   */
  load(filePath: string): GrantRow[] {
    this.logger.debug("Reading xlsx file", { filePath });
    const workbook = XLSX.readFile(filePath, { cellDates: true });
    const sheetName = workbook.SheetNames[0];
    if (!sheetName) {
      throw new Error(`No sheets found in ${filePath}`);
    }
    const sheet = workbook.Sheets[sheetName];
    if (!sheet) {
      throw new Error(`Sheet "${sheetName}" missing in ${filePath}`);
    }
    const rawRows = XLSX.utils.sheet_to_json<RawRow>(sheet, {
      header: 1,
      raw: true,
      defval: null,
    });

    if (rawRows.length === 0) {
      throw new Error(`No rows in sheet "${sheetName}" of ${filePath}`);
    }

    const headerRow = rawRows[0];
    if (!headerRow) {
      throw new Error(`Empty header row in ${filePath}`);
    }
    const columnIndex = this.resolveColumnIndex(headerRow, filePath);

    const rows: GrantRow[] = [];
    let blankRowsSkipped = 0;
    let dateNormalizationFailures = 0;
    let validationFailures = 0;
    for (let i = 1; i < rawRows.length; i++) {
      const raw = rawRows[i];
      if (!raw) {
        continue;
      }
      // With `header: 1`, sheet_to_json also emits blank sheet rows (filled
      // with `defval`). Skipping them here, rather than via `blankrows: false`,
      // keeps `rowIndex` in the log output aligned with the sheet.
      if (isBlankRow(raw)) {
        blankRowsSkipped++;
        continue;
      }

      const at = (field: keyof GrantRow): unknown => {
        const idx = columnIndex[field];
        return idx === undefined ? null : (raw[idx] ?? null);
      };

      const rawDecisionDate = at("decision_date");
      const decisionDate = normalizeExcelDate(rawDecisionDate);
      if (decisionDate === null && rawDecisionDate !== null) {
        dateNormalizationFailures++;
        this.logger.warn("Failed to normalize Päätös pvm", {
          rowIndex: i,
          raw: rawDecisionDate,
        });
      }

      const candidate = {
        decision_date: decisionDate,
        recipient: normalizeText(at("recipient")),
        granting_authority: normalizeText(at("granting_authority")),
        case_number: normalizeText(at("case_number")),
        amount_applied: normalizeAmount(at("amount_applied")),
        amount_granted: normalizeAmount(at("amount_granted")),
        has_eu_funding: normalizeEuFunding(at("has_eu_funding")),
        purpose: normalizeText(at("purpose")),
        programme: normalizeText(at("programme")),
        region: normalizeText(at("region")),
      };

      const parsed = GrantRowSchema.safeParse(candidate);
      if (!parsed.success) {
        validationFailures++;
        this.logger.warn("Row failed GrantRowSchema validation; skipping", {
          rowIndex: i,
          issues: parsed.error.issues,
        });
        continue;
      }
      rows.push(parsed.data);
    }

    if (validationFailures > 0) {
      this.logger.warn(
        "XlsxLoader produced rows that failed schema validation",
        {
          validationFailures,
          totalRawRows: rawRows.length - 1,
        }
      );
    }

    this.logger.info("Loaded xlsx rows", {
      filePath,
      rowCount: rows.length,
      blankRowsSkipped,
      dateNormalizationFailures,
      validationFailures,
    });
    return rows;
  }

  /**
   * Maps each GrantRow field to its column position by header text.
   *
   * @throws Error listing every expected header absent from `headerRow`.
   */
  private resolveColumnIndex(
    headerRow: RawRow,
    filePath: string
  ): Partial<Record<keyof GrantRow, number>> {
    const columnIndex: Partial<Record<keyof GrantRow, number>> = {};
    headerRow.forEach((cell, idx) => {
      const headerText = typeof cell === "string" ? cell.trim() : "";
      const field = HEADER_TO_FIELD[headerText];
      if (field) {
        columnIndex[field] = idx;
      }
    });

    const missing = Object.entries(HEADER_TO_FIELD)
      .filter(([, field]) => columnIndex[field] === undefined)
      .map(([header]) => header);
    if (missing.length > 0) {
      throw new Error(
        `Missing expected columns in ${filePath}: ${missing.join(", ")}`
      );
    }
    return columnIndex;
  }
}

/** True when every cell is `null` or a whitespace-only string. */
const isBlankRow = (row: RawRow): boolean =>
  row.every(
    (cell) =>
      cell === null || (typeof cell === "string" && cell.trim().length === 0)
  );
pnpm run:grants-explorer --file=tmp/paatokset.xlsx + +import "dotenv/config"; + +import { AgentRunner } from "~clients/agent-runner"; +import { Logger } from "~clients/logger"; +import { parseArgs } from "~utils/parse-args"; +import { QuestionHandler } from "~utils/question-handler"; + +import { GrantsDatabase } from "./clients/database"; +import { XlsxLoader } from "./clients/xlsx-loader"; +import { AGENT_MODEL, AGENT_NAME, DEFAULT_XLSX_PATH } from "./constants"; +import { createSqlQueryTool } from "./tools/sql-tool"; +import { + CliArgsSchema, + GrantsAgentOutputSchema, + GrantsAgentOutputTypeSchema, +} from "./types/schemas"; + +const logger = new Logger(); + +let db: GrantsDatabase | null = null; + +try { + const { file } = parseArgs({ logger, schema: CliArgsSchema }); + const xlsxPath = file ?? DEFAULT_XLSX_PATH; + + const rows = new XlsxLoader({ logger }).load(xlsxPath); + + db = new GrantsDatabase(logger); + db.insertRows(rows); + logger.info("Grants loaded into in-memory SQL", { + rows: db.getTotalCount(), + }); + + const agentRunner = new AgentRunner({ + name: AGENT_NAME, + model: AGENT_MODEL, + tools: [createSqlQueryTool(db)], + outputType: GrantsAgentOutputTypeSchema, + instructions: `You are an analyst for a Finnish grant-decisions dataset, loaded into an in-memory SQLite database. + +Use the \`query_grants\` tool to run SQL SELECT queries against the \`grants\` table. + +Schema (English column | source Finnish header): +- decision_date (Päätös pvm) ISO date 'YYYY-MM-DD', may be NULL +- recipient (Saajan nimi) incl. y-tunnus in parens, e.g. "Lapin Martat ry (0210606-0)" +- granting_authority (Myöntäjä) e.g. 
"Lapin ELY-keskus" +- case_number (Asianumero) TEXT +- amount_applied (Haettu) EUR, may be NULL +- amount_granted (Myönnetty) EUR, may be NULL +- has_eu_funding (EU-varat) 0 or 1 +- purpose (Hyväksytty käyttötarkoitus) +- programme (Haun nimi (asianumero)) +- region (Alueet) region / municipality + +Notes: +- Amounts and dates can be NULL; SUM/AVG handle that correctly. +- For y-tunnus searches use LIKE '%%' on recipient. +- Answer in the language of the user's question (Finnish or English). +- Be concise and grounded in the SQL results; don't invent numbers. + +IMPORTANT: Respond with ONLY a valid JSON object: +{"response":{"status":"final"|"needs_clarification","content":"..."}} + +- "final": answer in "content". +- "needs_clarification": one concise follow-up question in "content". +No markdown, no extra keys.`, + logger, + logToolArgs: true, + stateless: true, + }); + + const questionHandler = new QuestionHandler({ logger }); + const userQuestion = await questionHandler.askString({ + prompt: "Ask about Finnish grant decisions: ", + }); + if (!userQuestion.trim()) { + // Empty input → exit quietly without invoking the model. 
const DANGEROUS_KEYWORDS = [
  "DROP",
  "DELETE",
  "INSERT",
  "UPDATE",
  "ALTER",
  "CREATE",
  "TRUNCATE",
  "EXEC",
  "EXECUTE",
  "ATTACH",
  "DETACH",
];

// Compiled once at module load instead of once per keyword on every call.
const DANGEROUS_KEYWORD_PATTERNS = DANGEROUS_KEYWORDS.map((keyword) => ({
  keyword,
  pattern: new RegExp(`\\b${keyword}\\b`, "i"),
}));

// Spans that are data or commentary rather than SQL code: '…' strings,
// "…" / `…` / […] quoted identifiers, `-- …` line comments and `/* … */`
// block comments. An unterminated quote does NOT match, so whatever follows
// it is still scanned as code and stays subject to every check below.
const SQL_LITERAL_OR_COMMENT =
  /'(?:[^']|'')*'|"(?:[^"]|"")*"|`(?:[^`]|``)*`|\[[^\]]*\]|--[^\n]*|\/\*[\s\S]*?(?:\*\/|$)/g;

/**
 * Checks that `sql` is a single read-only SELECT statement.
 *
 * Defense-in-depth: the database is in-memory, but rejecting non-SELECT
 * statements keeps a misbehaving model from corrupting the working dataset
 * mid-conversation.
 *
 * String literals, quoted identifiers and comments are blanked before the
 * checks run, so a filter such as `purpose LIKE '%update%'` or a literal
 * containing `;` is not mistaken for a forbidden keyword or a second
 * statement. One trailing semicolon is tolerated.
 *
 * @param sql - Query text proposed by the agent.
 * @returns `{ valid: true }`, or `{ valid: false, error }` naming the first
 *   rule violated (non-SELECT, multiple statements, then forbidden keyword).
 */
const validateReadOnlyQuery = (
  sql: string
): { valid: true } | { valid: false; error: string } => {
  const code = sql.replace(SQL_LITERAL_OR_COMMENT, " ").trim();

  // `\b` keeps identifiers that merely start with "select" from passing.
  if (!/^SELECT\b/i.test(code)) {
    return { valid: false, error: "Only SELECT queries are allowed" };
  }

  const statement = code.replace(/;\s*$/, "");
  if (statement.includes(";")) {
    return { valid: false, error: "Multiple statements are not allowed" };
  }

  const forbidden = DANGEROUS_KEYWORD_PATTERNS.find(({ pattern }) =>
    pattern.test(statement)
  );
  if (forbidden) {
    return { valid: false, error: `Forbidden keyword: ${forbidden.keyword}` };
  }
  return { valid: true };
};
+ +Example queries: +- Total granted per authority: + SELECT granting_authority, SUM(amount_granted) AS total FROM grants GROUP BY granting_authority ORDER BY total DESC LIMIT 10 +- Top 5 single grants: + SELECT decision_date, recipient, amount_granted FROM grants ORDER BY amount_granted DESC LIMIT 5 +- EU-funded vs. not: + SELECT has_eu_funding, COUNT(*) AS n, SUM(amount_granted) AS sum_eur FROM grants GROUP BY has_eu_funding`, + parameters: z.object({ + sql: z.string().describe("A single SQL SELECT query"), + }), + execute: ({ sql }: { sql: string }) => { + const validation = validateReadOnlyQuery(sql); + if (!validation.valid) { + return { error: validation.error }; + } + try { + const results = db.query(sql); + return { results }; + } catch (error) { + return { error: String(error) }; + } + }, + }); diff --git a/src/cli/grants-explorer/types/schemas.ts b/src/cli/grants-explorer/types/schemas.ts new file mode 100644 index 0000000..19e27d9 --- /dev/null +++ b/src/cli/grants-explorer/types/schemas.ts @@ -0,0 +1,40 @@ +import { z } from "zod"; + +export const CliArgsSchema = z.object({ + file: z.string().optional(), +}); + +export type CliArgs = z.infer<typeof CliArgsSchema>; + +// OpenAI structured outputs doesn't allow union/anyOf at the root. +// We use one flat schema where `content` holds either the answer or +// a clarifying question, distinguished by `status`. +export const GrantsAgentOutputTypeSchema = z.object({ + response: z.object({ + status: z.enum(["final", "needs_clarification"]), + content: z.string().min(1), + }), +}); + +export const GrantsAgentOutputSchema = GrantsAgentOutputTypeSchema; + +export type GrantsAgentOutput = z.infer<typeof GrantsAgentOutputSchema>; + +// Single source of truth for one row loaded from the xlsx. Used as a runtime +// validation tripwire in XlsxLoader: if the per-cell normalizers ever produce +// a value that violates this shape (off-by-one bug, schema drift, etc.), the +// loader logs the failing row instead of silently inserting garbage into SQL.
+export const GrantRowSchema = z.object({ + decision_date: z.string().nullable(), + recipient: z.string().nullable(), + granting_authority: z.string().nullable(), + case_number: z.string().nullable(), + amount_applied: z.number().int().nullable(), + amount_granted: z.number().int().nullable(), + has_eu_funding: z.union([z.literal(0), z.literal(1)]), + purpose: z.string().nullable(), + programme: z.string().nullable(), + region: z.string().nullable(), +}); + +export type GrantRow = z.infer<typeof GrantRowSchema>; From bdd7d72e8566f0d3859335f91d5957fd10bd4efb Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Sun, 24 May 2026 22:49:45 +0300 Subject: [PATCH 2/5] feat: add --refetch flag to grants-explorer --- .claude/settings.json | 6 +- src/cli/grants-explorer/README.md | 11 + .../clients/xlsx-downloader.test.ts | 251 +++++++++++ .../clients/xlsx-downloader.ts | 392 ++++++++++++++++++ src/cli/grants-explorer/constants.ts | 2 + src/cli/grants-explorer/main.ts | 24 +- .../grants-explorer/should-refetch.test.ts | 21 + src/cli/grants-explorer/should-refetch.ts | 15 + src/cli/grants-explorer/types/schemas.test.ts | 64 +++ src/cli/grants-explorer/types/schemas.ts | 6 + 10 files changed, 787 insertions(+), 5 deletions(-) create mode 100644 src/cli/grants-explorer/clients/xlsx-downloader.test.ts create mode 100644 src/cli/grants-explorer/clients/xlsx-downloader.ts create mode 100644 src/cli/grants-explorer/should-refetch.test.ts create mode 100644 src/cli/grants-explorer/should-refetch.ts create mode 100644 src/cli/grants-explorer/types/schemas.test.ts diff --git a/.claude/settings.json b/.claude/settings.json index 414b774..5d49fd4 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -9,7 +9,8 @@ "Bash(pnpm format:*)", "Bash(pnpm format:check:*)", "Bash(pnpm test:*)", - "Bash(tsx scripts/scaffold-cli.ts:*)" + "Bash(tsx scripts/scaffold-cli.ts:*)", + "Bash(gh pr create:*)" ], "deny": [ "Bash(curl:*)", @@ -20,8 +21,7 @@ "Read(**/.env.*)",
"Read(**/.env*)", "Read(**/secrets/**)", - "Bash(git push:*)", - "Bash(gh pr create:*)" + "Bash(git push:*)" ], "ask": [ "Bash(pnpm install:*)", diff --git a/src/cli/grants-explorer/README.md b/src/cli/grants-explorer/README.md index 57cb47b..b3229a7 100644 --- a/src/cli/grants-explorer/README.md +++ b/src/cli/grants-explorer/README.md @@ -7,11 +7,22 @@ Loads the Finnish grant-decisions workbook at `tmp/paatokset.xlsx` into an in-me ``` pnpm run:grants-explorer pnpm run:grants-explorer --file=tmp/paatokset.xlsx +pnpm run:grants-explorer --refetch ``` ## Arguments - `--file` (optional): path to the xlsx workbook. Defaults to `tmp/paatokset.xlsx`. +- `--refetch` (optional, presence-only flag): force-download the latest xlsx from [tutkihallintoa.fi](https://www.tutkihallintoa.fi/valtionavustukset/tutkiavustuksia/) before loading. Without the flag, the CLI uses the local file if present and auto-downloads only when it's missing. Pass it bare (`--refetch`) to enable; omit it to disable. Any explicit value (`--refetch=false`, `--refetch=true`, …) is rejected by the schema. + +## Source data + +`paatokset.xlsx` is downloaded from the Tutkiavustuksia.fi Power BI report, pre-filtered to: + +- Tab: **Avustusasiat** +- Slicer: **Sektoriluokitus = S15 Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt** (Non-profit institutions serving households) + +Other filter scopes (date ranges, other sectors, other tabs) are intentionally not exposed as CLI flags — broadening the scope would change which grants land in the SQL DB and invalidate any saved analyses. Filter per-query in SQL after load instead. 
## Table schema diff --git a/src/cli/grants-explorer/clients/xlsx-downloader.test.ts b/src/cli/grants-explorer/clients/xlsx-downloader.test.ts new file mode 100644 index 0000000..e40f2b2 --- /dev/null +++ b/src/cli/grants-explorer/clients/xlsx-downloader.test.ts @@ -0,0 +1,251 @@ +import { + mkdtemp, + readdir, + readFile, + rm, + stat, + writeFile, +} from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { dirname, join } from "node:path"; +import { Logger } from "~clients/logger"; +import type { Mock } from "vitest"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import XLSX from "xlsx"; + +// === Test fixture: a tiny valid xlsx with the schema the downloader's +// post-download validation step expects (XlsxLoader). One header row + one +// data row is enough to satisfy "rows.length > 0". ========================== +const buildFixtureXlsx = (): Buffer => { + const wb = XLSX.utils.book_new(); + const ws = XLSX.utils.aoa_to_sheet([ + [ + "Päätös pvm", + "Saajan nimi", + "Myöntäjä", + "Asianumero", + "Haettu", + "Myönnetty", + "EU-varat", + "Hyväksytty käyttötarkoitus", + "Haun nimi (asianumero)", + "Alueet", + ], + [ + 46022, + "Test ry (1234567-8)", + "Test ELY-keskus", + "T-001", + 1000, + 800, + "", + "Test purpose", + "Test programme (key-1)", + "Test region", + ], + ]); + XLSX.utils.book_append_sheet(wb, ws, "Export"); + return XLSX.write(wb, { type: "buffer", bookType: "xlsx" }) as Buffer; +}; + +// === Playwright mock. We construct a minimal browser/page/frame tree that +// mirrors the downloader's call sequence. Tests inject failure points via +// the `behavior` argument. ================================================== +type Behavior = { + failTabClick?: boolean; + produceInvalidXlsx?: boolean; + fixture: Buffer; +}; + +let closeMock: Mock; + +const resolved = vi.fn(() => Promise.resolve()); + +// A chainable noop locator/handle. 
Most methods return the same object so +// `.first().click()`, `.filter({...}).first().click()`, etc. all resolve. +type ChainOverrides = { + click?: Mock; +}; +const makeChainable = (overrides: ChainOverrides = {}) => { + const chain: Record = { + click: overrides.click ?? resolved, + waitFor: resolved, + count: () => Promise.resolve(1), + boundingBox: () => + Promise.resolve({ x: 0, y: 0, width: 1600, height: 1100 }), + evaluate: () => Promise.resolve(), + scrollIntoViewIfNeeded: () => Promise.resolve(), + }; + chain.first = () => chain; + chain.last = () => chain; + chain.or = () => chain; + chain.filter = () => chain; + chain.locator = () => chain; + return chain; +}; + +const buildPlaywrightMock = (behavior: Behavior) => { + closeMock = vi.fn(() => Promise.resolve()); + + const tabClick = vi.fn(() => { + if (behavior.failTabClick) { + return Promise.reject(new Error("simulated tab click failure")); + } + return Promise.resolve(); + }); + + const frameMock = { + url: () => "https://app.powerbi.com/reportEmbed?reportId=demo", + evaluate: vi.fn(() => Promise.resolve(true)), + waitForTimeout: resolved, + evaluateHandle: vi.fn(() => + Promise.resolve({ + asElement: () => ({ + // Used by both the slicer-search input and the Myönteiset table + // visualContainer in the downloader's call sequence. + click: () => Promise.resolve(), + boundingBox: () => + Promise.resolve({ x: 100, y: 200, width: 800, height: 400 }), + scrollIntoViewIfNeeded: () => Promise.resolve(), + evaluate: () => Promise.resolve(), + }), + }) + ), + getByText: vi.fn((needle: string | RegExp) => { + const isAvustusasiat = + typeof needle === "string" && needle === "Avustusasiat"; + return makeChainable({ + click: isAvustusasiat ? tabClick : resolved, + }); + }), + locator: vi.fn(() => makeChainable()), + getByRole: vi.fn(() => + makeChainable({ + // The optional "confirm export" dialog isn't shown in tests; the + // downloader catches this rejection and continues. 
+ click: vi.fn(() => Promise.reject(new Error("no confirmation dialog"))), + }) + ), + }; + + const downloadMock = { + saveAs: vi.fn((path: string) => { + const bytes = behavior.produceInvalidXlsx + ? Buffer.from("not a real xlsx", "utf8") + : behavior.fixture; + return writeFile(path, bytes); + }), + suggestedFilename: () => "paatokset.xlsx", + }; + + const pageMock = { + route: vi.fn(), + goto: resolved, + addStyleTag: resolved, + mouse: { move: resolved }, + waitForTimeout: resolved, + waitForEvent: vi.fn(() => Promise.resolve(downloadMock)), + keyboard: { press: resolved, type: resolved }, + frames: () => [frameMock as never], + locator: vi.fn(() => ({ + boundingBox: () => + Promise.resolve({ x: 0, y: 0, width: 1600, height: 1100 }), + })), + screenshot: resolved, + }; + + const contextMock = { + newPage: vi.fn(() => Promise.resolve(pageMock)), + }; + + return { + chromium: { + launch: vi.fn(() => + Promise.resolve({ + newContext: vi.fn(() => Promise.resolve(contextMock)), + close: closeMock, + }) + ), + }, + }; +}; + +// Module-scoped mock holders mutated by each test. 
+let playwrightMock: ReturnType<typeof buildPlaywrightMock>; +vi.mock("playwright", () => ({ + get chromium() { + return playwrightMock.chromium; + }, +})); + +const silentLogger = new Logger({ + level: "error", + useColors: false, + useTimestamps: false, +}); + +describe("XlsxDownloader", () => { + let workDir: string; + let fixture: Buffer; + + beforeEach(async () => { + workDir = await mkdtemp(join(tmpdir(), "xlsx-downloader-test-")); + fixture = buildFixtureXlsx(); + }); + + afterEach(async () => { + await rm(workDir, { recursive: true, force: true }); + }); + + it("creates nested destination directories before writing", async () => { + playwrightMock = buildPlaywrightMock({ fixture }); + const { XlsxDownloader } = await import("./xlsx-downloader"); + const dest = join(workDir, "nested", "deep", "paatokset.xlsx"); + + await new XlsxDownloader({ + logger: silentLogger, + sourceUrl: "https://example.invalid/source", + }).download(dest); + + const info = await stat(dest); + expect(info.size).toBeGreaterThan(0); + }); + + it("closes the browser when navigation throws", async () => { + playwrightMock = buildPlaywrightMock({ fixture, failTabClick: true }); + const { XlsxDownloader } = await import("./xlsx-downloader"); + const dest = join(workDir, "paatokset.xlsx"); + + await expect( + new XlsxDownloader({ + logger: silentLogger, + sourceUrl: "https://example.invalid/source", + }).download(dest) + ).rejects.toThrow(/tab click failure/); + + expect(closeMock).toHaveBeenCalledTimes(1); + }); + + it("preserves the existing destination file when validation fails", async () => { + playwrightMock = buildPlaywrightMock({ fixture, produceInvalidXlsx: true }); + const { XlsxDownloader } = await import("./xlsx-downloader"); + const dest = join(workDir, "paatokset.xlsx"); + + // Pre-populate destination with the known-good fixture bytes.
+ await writeFile(dest, fixture); + const goodBytes = await readFile(dest); + + await expect( + new XlsxDownloader({ + logger: silentLogger, + sourceUrl: "https://example.invalid/source", + }).download(dest) + ).rejects.toThrow(); + + const afterBytes = await readFile(dest); + expect(afterBytes.equals(goodBytes)).toBe(true); + + const siblings = await readdir(dirname(dest)); + expect(siblings.some((f) => f.includes(".tmp-"))).toBe(false); + }); +}); diff --git a/src/cli/grants-explorer/clients/xlsx-downloader.ts b/src/cli/grants-explorer/clients/xlsx-downloader.ts new file mode 100644 index 0000000..2777c4c --- /dev/null +++ b/src/cli/grants-explorer/clients/xlsx-downloader.ts @@ -0,0 +1,392 @@ +import { randomUUID } from "node:crypto"; +import { mkdir, rename, unlink } from "node:fs/promises"; +import { dirname } from "node:path"; +import type { Logger } from "~clients/logger"; +import type { Browser, Frame, Page } from "playwright"; +import { chromium } from "playwright"; + +import { XlsxLoader } from "./xlsx-loader"; + +// Selectors locked in from manual discovery on 2026-05-24 against +// https://www.tutkihallintoa.fi/valtionavustukset/tutkiavustuksia/ — the +// wrapper embeds a Power BI report (https://app.powerbi.com/reportEmbed?...) +// with isMobile=true. We strip isMobile=true via page.route so we get the +// desktop variant (slicers + per-visual overflow menu). Power BI does not +// expose tab/menuitem ARIA roles, so navigation uses text + class selectors. +// +// Filter scope: this downloader reproduces the existing tmp/paatokset.xlsx — +// the "Avustusasiat" tab with the Sektoriluokitus slicer set to +// "S15 Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt". +// Other filter scopes are out of scope (see plan / README). + +const TAB_NAME = "Avustusasiat"; +const SLICER_LABEL = "Sektoriluokitus"; +// S15 = "Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt" +// (Non-profit institutions serving households). 
The slicer's search input is +// "Hae"; typing the prefix "S15" filters the virtualized list to a single +// match, which we then click. +const SECTOR_OPTION_PREFIX = "S15"; + +const POWERBI_IFRAME_SELECTOR = 'iframe[src*="powerbi"]'; +const PBI_OVERFLOW_BTN_SELECTOR = ".vcMenuBtn"; +const PBI_OVERFLOW_BTN_ARIA = "Enemmän vaihtoehtoja"; + +const COOKIE_SUPPRESS_CSS = ` + .cc-banner, .cc-banner-wrapper, .cc-window, .cc-container { + display: none !important; + pointer-events: none !important; + visibility: hidden !important; + } +`; + +const REPORT_READY_TIMEOUT_MS = 90_000; +const DOWNLOAD_TIMEOUT_MS = 120_000; +const DEBUG_DIR = "tmp/grants-explorer/debug"; + +export type XlsxDownloaderOptions = { + logger: Logger; + sourceUrl: string; +}; + +/** + * Downloads the Avustusasiat (S15) grants xlsx from tutkihallintoa.fi via + * Power BI's per-visual Export menu, atomically replacing the destination + * only if the downloaded file parses successfully through XlsxLoader. + * + * Contract: + * - Never leaves the destination in a partial state. Writes to a sibling + * temp file, validates by parsing, then renames over destination. + * - On any failure, removes the temp file and leaves destination untouched. + * - Always closes the browser (finally), so a hung selector doesn't leak + * a Chromium process. 
+ */ +export class XlsxDownloader { + private logger: Logger; + private sourceUrl: string; + + constructor({ logger, sourceUrl }: XlsxDownloaderOptions) { + this.logger = logger; + this.sourceUrl = sourceUrl; + } + + async download(destinationPath: string): Promise { + await mkdir(dirname(destinationPath), { recursive: true }); + const tempPath = `${destinationPath}.tmp-${Date.now()}-${randomUUID().slice(0, 8)}`; + + this.logger.info("Downloading paatokset.xlsx", { + sourceUrl: this.sourceUrl, + tempPath, + }); + + let browser: Browser | null = null; + let page: Page | null = null; + let step = "launch"; + try { + browser = await chromium.launch({ headless: true }); + const context = await browser.newContext({ + acceptDownloads: true, + locale: "fi-FI", + // 1100px viewport: Power BI's hover-to-reveal chrome only triggers + // when the visual being hovered fits within the viewport scroll + // window. A taller viewport breaks the hover detection. The + // export-options dialog button sits outside the viewport in this + // mode, so we confirm via keyboard Enter (the button is autofocus). 
+ viewport: { width: 1600, height: 1100 }, + }); + page = await context.newPage(); + + await page.route("https://app.powerbi.com/reportEmbed**", (route) => { + const url = new URL(route.request().url()); + if (url.searchParams.get("isMobile") === "true") { + url.searchParams.delete("isMobile"); + return route.continue({ url: url.toString() }); + } + return route.continue(); + }); + + step = "goto"; + await page.goto(this.sourceUrl, { + waitUntil: "domcontentloaded", + timeout: 60_000, + }); + await page.addStyleTag({ content: COOKIE_SUPPRESS_CSS }); + + step = "report-ready"; + const report = await this.waitForReportFrame(page); + + step = "tab-click"; + await this.clickAvustusasiatTab(report); + + step = "slicer-filter"; + await this.applySectorFilter(report, page); + + step = "open-export"; + const download = await this.openExportAndDownload(report, page); + + step = "save"; + await download.saveAs(tempPath); + this.logger.info("Saved download to temp path", { tempPath }); + + step = "validate"; + const rows = new XlsxLoader({ logger: this.logger }).load(tempPath); + if (rows.length === 0) { + throw new Error("Downloaded xlsx parsed to 0 rows"); + } + this.logger.info("Validated download", { tempPath, rows: rows.length }); + + step = "rename"; + await rename(tempPath, destinationPath); + this.logger.info("Refetched paatokset.xlsx", { destinationPath }); + } catch (error) { + this.logger.error("XlsxDownloader failed", { + step, + error: error instanceof Error ? 
error.message : String(error), + }); + if (page) { + await this.debugSnapshot(page, step); + } + await this.cleanupTemp(tempPath); + throw error; + } finally { + if (browser) { + await browser.close(); + } + } + } + + private async waitForReportFrame(page: Page): Promise { + const start = Date.now(); + while (Date.now() - start < REPORT_READY_TIMEOUT_MS) { + const report = page + .frames() + .find((frame) => /powerbi/i.test(frame.url())); + if (report) { + try { + const ready = await report.evaluate(() => + /Sektoriluokitus|Avustusasiat/.test(document.body.innerText) + ); + if (ready) { + this.logger.debug("Power BI report frame ready", { + elapsedMs: Date.now() - start, + }); + return report; + } + } catch { + // Frame navigating; retry. + } + } + await page.waitForTimeout(1000); + } + await this.debugSnapshot(page, "report-ready"); + throw new Error("Timed out waiting for Power BI report frame"); + } + + private async clickAvustusasiatTab(report: Frame): Promise { + const tab = report.getByText(TAB_NAME, { exact: true }).first(); + await tab.waitFor({ state: "visible", timeout: 15_000 }); + await tab.click({ timeout: 10_000 }); + // Wait for the page-switch to settle. + await report.waitForTimeout(3000); + this.logger.debug("Clicked report tab", { tab: TAB_NAME }); + } + + private async applySectorFilter(report: Frame, page: Page): Promise { + // The Sektoriluokitus slicer is a dropdown with aria-label + // "Sektoriluokituskoodi- ja nimi" (the underlying column name). It + // opens a virtualized list (~8 visible at a time) with a search input + // labeled "Hae" — typing into that input filters the list to matching + // items, which we then click. 
+ const dropdown = report + .locator('[aria-label="Sektoriluokituskoodi- ja nimi"]') + .first(); + await dropdown.waitFor({ state: "visible", timeout: 15_000 }); + await dropdown.click({ timeout: 10_000 }); + this.logger.debug("Opened slicer dropdown", { slicer: SLICER_LABEL }); + await page.waitForTimeout(1500); + + // Find the search input by enumerating all "Hae" inputs and picking the + // visible one (only one is visible at a time — the one inside the + // just-opened dropdown). + const searchHandle = await report.evaluateHandle(() => { + const inputs = [ + ...document.querySelectorAll( + 'input[aria-label="Hae"], input[placeholder="Hae"]' + ), + ]; + return ( + inputs.find((el) => { + const r = el.getBoundingClientRect(); + return r.width > 0 && r.height > 0; + }) ?? null + ); + }); + const searchEl = searchHandle.asElement(); + if (!searchEl) { + throw new Error( + "No visible 'Hae' search input found after opening slicer dropdown" + ); + } + // Click to focus, then type. fill() can race with Power BI's own + // focus management; click+type is more robust. + // Click to focus, then keyboard.type to enter text. Locator.fill() can + // race with Power BI's own focus management; an explicit focus+type + // through page.keyboard is more reliable. + await searchEl.click(); + await page.keyboard.type(SECTOR_OPTION_PREFIX, { delay: 30 }); + this.logger.debug("Typed slicer search query", { + query: SECTOR_OPTION_PREFIX, + }); + await page.waitForTimeout(1500); + + // After the search filter applies, S15 should be the (only) remaining + // option. Power BI renders options as rows in a listbox; the visible + // text node matching the prefix is the click target. 
+ const option = report + .getByText(new RegExp(`^${SECTOR_OPTION_PREFIX}\\s`)) + .first(); + await option.waitFor({ state: "visible", timeout: 10_000 }); + await option.click({ timeout: 10_000 }); + this.logger.debug("Selected sector option", { + sector: SECTOR_OPTION_PREFIX, + }); + + // Close the dropdown so it doesn't obscure the per-visual overflow menu. + await page.keyboard.press("Escape"); + await page.waitForTimeout(2000); + } + + private async openExportAndDownload(report: Frame, page: Page) { + // The Avustusasiat page renders two data tables: "Myönteiset päätökset" + // (positive grant decisions — what we want, matches paatokset.xlsx) and + // "Kielteiset päätökset" (negative decisions, below). Both expose a + // per-visual overflow button (.vcMenuBtn / aria "Enemmän vaihtoehtoja") + // when hovered. We must target the *positive* visual specifically. + const POSITIVE_TABLE_LABEL_PREFIX = "Myönteiset päätökset"; + const tableHandle = await report.evaluateHandle((prefix: string) => { + const visuals = [...document.querySelectorAll(".visualContainer")]; + return ( + visuals.find((el) => + (el.getAttribute("aria-label") ?? "").startsWith(prefix) + ) ?? null + ); + }, POSITIVE_TABLE_LABEL_PREFIX); + const tableEl = tableHandle.asElement(); + if (!tableEl) { + throw new Error( + `Could not locate "${POSITIVE_TABLE_LABEL_PREFIX}" visual on Avustusasiat` + ); + } + + // Scroll the positive-decisions visual into view so the table's + // overflow button is positioned at a known location inside the iframe. + await tableEl.scrollIntoViewIfNeeded(); + await page.waitForTimeout(1000); + + // Hover the centre of the visual to trigger Power BI's overflow + // chrome. Empirically, centre-hover reveals the .vcMenuBtn; top-right + // hover does not (the buttons aren't anchored to the title bar). 
+ const box = await tableEl.boundingBox(); + if (!box) { + throw new Error("Positive-decisions table has no bounding box"); + } + const iframeBox = await page.locator(POWERBI_IFRAME_SELECTOR).boundingBox(); + if (!iframeBox) { + throw new Error("Power BI iframe has no bounding box"); + } + await page.mouse.move( + iframeBox.x + box.x + box.width / 2, + iframeBox.y + box.y + box.height / 2, + { steps: 5 } + ); + await page.waitForTimeout(1500); + + // .vcMenuBtn is a sibling of .visualContainer (not a child) — Power BI + // renders visual chrome in a separate overlay layer. Since only the + // currently-hovered visual has its button visible, an unscoped lookup + // for the *visible* .vcMenuBtn safely targets the Myönteiset table. + const overflow = report + .locator( + `${PBI_OVERFLOW_BTN_SELECTOR}, [aria-label="${PBI_OVERFLOW_BTN_ARIA}"]` + ) + .first(); + await overflow.waitFor({ state: "visible", timeout: 10_000 }); + await overflow.click({ timeout: 10_000, force: true }); + this.logger.debug("Opened per-visual overflow menu", { + visual: POSITIVE_TABLE_LABEL_PREFIX, + }); + await page.waitForTimeout(1000); + + // Click "Vie tiedot" — this opens an export-options dialog (Mitkä + // tiedot haluat viedä?) with format pre-selected to .xlsx and a "Vie" + // primary button. The download fires when we click the dialog's Vie. + const exportMenu = report.getByText(/^(Vie tiedot|Export data)/i).first(); + await exportMenu.waitFor({ state: "visible", timeout: 8_000 }); + await exportMenu.click({ timeout: 10_000 }); + this.logger.debug('Clicked "Vie tiedot" menu item'); + + // Wait for the export-options dialog by its heading. + const dialog = report + .locator('[role="dialog"]') + .filter({ hasText: /Mitkä tiedot haluat viedä|What data do you want/i }) + .first(); + await dialog.waitFor({ state: "visible", timeout: 10_000 }); + + // Default format is already .xlsx (max 150 000 rows) — no need to + // change it. 
The primary action button is "Vie" with class + // pbi-modern-button.primaryBtn.exportButton (aria-label="Vie"). + const downloadPromise = page.waitForEvent("download", { + timeout: DOWNLOAD_TIMEOUT_MS, + }); + // If something below throws, downloadPromise stays unawaited and Node + // would log an unhandled-rejection when the browser closes in finally. + // Attach a no-op catch so the error surfaces via the thrown click error, + // not as a separate noisy rejection. + downloadPromise.catch(() => { + // intentional: surface the real error from the click path below + }); + // Invoke the dialog's "Vie" button's DOM .click() directly. Playwright's + // user-mode click refuses to fire because the iframe is taller than the + // viewport — the button's page-y is outside Playwright's actionability + // window, even with force=true. The button has data-testid="export-btn". + // A JS-level .click() bypasses positioning entirely and triggers the + // same handler the user click would. + const exportConfirm = dialog + .locator( + 'button[data-testid="export-btn"], button[aria-label="Vie"], button.exportButton.primaryBtn' + ) + .first(); + await exportConfirm.waitFor({ state: "attached", timeout: 5_000 }); + await exportConfirm.evaluate((el) => { + (el as HTMLElement).click(); + }); + this.logger.debug("Confirmed export dialog via DOM click"); + + const download = await downloadPromise; + this.logger.debug("Download event received", { + suggestedFilename: download.suggestedFilename(), + }); + return download; + } + + private async debugSnapshot(page: Page, step: string): Promise { + try { + await mkdir(DEBUG_DIR, { recursive: true }); + const path = `${DEBUG_DIR}/${step}-${Date.now()}.png`; + await page.screenshot({ path, fullPage: false }); + this.logger.warn("Saved debug screenshot", { step, path }); + } catch (error) { + this.logger.debug("Failed to write debug screenshot", { + step, + error: error instanceof Error ? 
error.message : String(error), + }); + } + } + + private async cleanupTemp(tempPath: string): Promise<void> { + try { + await unlink(tempPath); + } catch { + // Temp file may not exist yet (e.g. failure before saveAs). + } + } +} diff --git a/src/cli/grants-explorer/constants.ts b/src/cli/grants-explorer/constants.ts index 8f622cd..6ae67a9 100644 --- a/src/cli/grants-explorer/constants.ts +++ b/src/cli/grants-explorer/constants.ts @@ -1,3 +1,5 @@ export const AGENT_NAME = "GrantsExplorerAgent"; export const AGENT_MODEL = "gpt-5-mini"; export const DEFAULT_XLSX_PATH = "tmp/paatokset.xlsx"; +export const PAATOKSET_SOURCE_URL = + "https://www.tutkihallintoa.fi/valtionavustukset/tutkiavustuksia/"; diff --git a/src/cli/grants-explorer/main.ts b/src/cli/grants-explorer/main.ts index 1ac8887..ab57890 100644 --- a/src/cli/grants-explorer/main.ts +++ b/src/cli/grants-explorer/main.ts @@ -1,16 +1,25 @@ // pnpm run:grants-explorer // pnpm run:grants-explorer --file=tmp/paatokset.xlsx +// pnpm run:grants-explorer --refetch import "dotenv/config"; +import { existsSync } from "node:fs"; import { AgentRunner } from "~clients/agent-runner"; import { Logger } from "~clients/logger"; import { parseArgs } from "~utils/parse-args"; import { QuestionHandler } from "~utils/question-handler"; import { GrantsDatabase } from "./clients/database"; +import { XlsxDownloader } from "./clients/xlsx-downloader"; import { XlsxLoader } from "./clients/xlsx-loader"; -import { AGENT_MODEL, AGENT_NAME, DEFAULT_XLSX_PATH } from "./constants"; +import { + AGENT_MODEL, + AGENT_NAME, + DEFAULT_XLSX_PATH, + PAATOKSET_SOURCE_URL, +} from "./constants"; +import { shouldRefetch } from "./should-refetch"; import { createSqlQueryTool } from "./tools/sql-tool"; import { CliArgsSchema, @@ -23,9 +32,20 @@ const logger = new Logger(); let db: GrantsDatabase | null = null; try { - const { file } = parseArgs({ logger, schema: CliArgsSchema }); + const { file, refetch } = parseArgs({ logger, schema: CliArgsSchema }); const
xlsxPath = file ?? DEFAULT_XLSX_PATH; + const exists = existsSync(xlsxPath); + if (shouldRefetch({ refetch, exists })) { + if (!exists) { + logger.info("Local xlsx missing; downloading", { xlsxPath }); + } + await new XlsxDownloader({ + logger, + sourceUrl: PAATOKSET_SOURCE_URL, + }).download(xlsxPath); + } + const rows = new XlsxLoader({ logger }).load(xlsxPath); db = new GrantsDatabase(logger); diff --git a/src/cli/grants-explorer/should-refetch.test.ts b/src/cli/grants-explorer/should-refetch.test.ts new file mode 100644 index 0000000..f17304d --- /dev/null +++ b/src/cli/grants-explorer/should-refetch.test.ts @@ -0,0 +1,21 @@ +import { describe, expect, it } from "vitest"; + +import { shouldRefetch } from "./should-refetch"; + +describe("shouldRefetch", () => { + it("does not refetch when file exists and refetch flag is off", () => { + expect(shouldRefetch({ refetch: false, exists: true })).toBe(false); + }); + + it("auto-downloads when the local file is missing", () => { + expect(shouldRefetch({ refetch: false, exists: false })).toBe(true); + }); + + it("forces refresh when --refetch is passed", () => { + expect(shouldRefetch({ refetch: true, exists: true })).toBe(true); + }); + + it("downloads when --refetch is passed and file is missing", () => { + expect(shouldRefetch({ refetch: true, exists: false })).toBe(true); + }); +}); diff --git a/src/cli/grants-explorer/should-refetch.ts b/src/cli/grants-explorer/should-refetch.ts new file mode 100644 index 0000000..7b3f162 --- /dev/null +++ b/src/cli/grants-explorer/should-refetch.ts @@ -0,0 +1,15 @@ +/** + * Pure decision function for whether to (re-)download the xlsx before + * loading. Auto-downloads when the local file is missing; otherwise the + * user must explicitly pass `--refetch` to force a refresh. + * + * Lives in its own module so main.test.ts can import it without triggering + * main.ts's top-level CLI bootstrap. 
+ */ +export const shouldRefetch = ({ + refetch, + exists, +}: { + refetch: boolean; + exists: boolean; +}): boolean => refetch || !exists; diff --git a/src/cli/grants-explorer/types/schemas.test.ts b/src/cli/grants-explorer/types/schemas.test.ts new file mode 100644 index 0000000..f725970 --- /dev/null +++ b/src/cli/grants-explorer/types/schemas.test.ts @@ -0,0 +1,64 @@ +import { Logger } from "~clients/logger"; +import { parseArgs } from "~utils/parse-args"; +import { describe, expect, it } from "vitest"; + +import { CliArgsSchema } from "./schemas"; + +const silentLogger = new Logger({ + level: "error", + useColors: false, + useTimestamps: false, +}); + +describe("CliArgsSchema (grants-explorer)", () => { + it("defaults refetch to false when the flag is absent", () => { + const args = parseArgs({ + logger: silentLogger, + schema: CliArgsSchema, + rawArgs: [], + }); + expect(args.refetch).toBe(false); + expect(args.file).toBeUndefined(); + }); + + it("enables refetch when --refetch is present (no value)", () => { + const args = parseArgs({ + logger: silentLogger, + schema: CliArgsSchema, + rawArgs: ["--refetch"], + }); + expect(args.refetch).toBe(true); + }); + + it("accepts --file= as a path string", () => { + const args = parseArgs({ + logger: silentLogger, + schema: CliArgsSchema, + rawArgs: ["--file=tmp/other.xlsx"], + }); + expect(args.file).toBe("tmp/other.xlsx"); + expect(args.refetch).toBe(false); + }); + + // Pins the safer behavior: `--refetch` is presence-only, so any explicit + // value (`--refetch=false`, `--refetch=true`, `--refetch=foo`) is rejected + // by the schema rather than silently doing something surprising. The old + // z.coerce.boolean() form would have made `--refetch=false` truthy and + // clobbered the cached workbook — see the comment in schemas.ts. 
+ it("rejects --refetch= (presence-only flag)", () => { + expect(() => + parseArgs({ + logger: silentLogger, + schema: CliArgsSchema, + rawArgs: ["--refetch=false"], + }) + ).toThrow(); + expect(() => + parseArgs({ + logger: silentLogger, + schema: CliArgsSchema, + rawArgs: ["--refetch=true"], + }) + ).toThrow(); + }); +}); diff --git a/src/cli/grants-explorer/types/schemas.ts b/src/cli/grants-explorer/types/schemas.ts index 19e27d9..a49d43d 100644 --- a/src/cli/grants-explorer/types/schemas.ts +++ b/src/cli/grants-explorer/types/schemas.ts @@ -2,6 +2,12 @@ import { z } from "zod"; export const CliArgsSchema = z.object({ file: z.string().optional(), + // Presence-only flag. parseArgv hands us bare `true` for `--refetch` and + // `undefined` when absent. Any `--refetch=` form arrives as a string + // and is rejected here — preventing the historical `z.coerce.boolean()` + // foot-gun where `--refetch=false` would silently *enable* refetch and + // clobber the cached workbook. + refetch: z.boolean().default(false), }); export type CliArgs = z.infer; From 8879f98fef03df8e50b7c937123bb6ce1f72b1e7 Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Mon, 25 May 2026 22:37:10 +0300 Subject: [PATCH 3/5] feat: extract recipient_business_id into indexed column --- src/cli/grants-explorer/README.md | 27 +++--- .../grants-explorer/clients/database.test.ts | 38 +++++++++ src/cli/grants-explorer/clients/database.ts | 9 +- .../clients/xlsx-loader.test.ts | 84 ++++++++++++++++++- .../grants-explorer/clients/xlsx-loader.ts | 17 +++- src/cli/grants-explorer/main.ts | 6 +- src/cli/grants-explorer/tools/sql-tool.ts | 9 +- src/cli/grants-explorer/types/schemas.ts | 8 ++ .../grants-explorer/utils/business-id.test.ts | 35 ++++++++ src/cli/grants-explorer/utils/business-id.ts | 16 ++++ 10 files changed, 227 insertions(+), 22 deletions(-) create mode 100644 src/cli/grants-explorer/utils/business-id.test.ts create mode 100644 
src/cli/grants-explorer/utils/business-id.ts diff --git a/src/cli/grants-explorer/README.md b/src/cli/grants-explorer/README.md index b3229a7..752f6ce 100644 --- a/src/cli/grants-explorer/README.md +++ b/src/cli/grants-explorer/README.md @@ -26,21 +26,24 @@ Other filter scopes (date ranges, other sectors, other tabs) are intentionally n ## Table schema -| Column | Type | Source header | -| -------------------- | ------- | ---------------------------- | -| `decision_date` | TEXT | Päätös pvm (ISO date) | -| `recipient` | TEXT | Saajan nimi (incl. y-tunnus) | -| `granting_authority` | TEXT | Myöntäjä | -| `case_number` | TEXT | Asianumero | -| `amount_applied` | INTEGER | Haettu (EUR, nullable) | -| `amount_granted` | INTEGER | Myönnetty (EUR, nullable) | -| `has_eu_funding` | INTEGER | EU-varat (0/1) | -| `purpose` | TEXT | Hyväksytty käyttötarkoitus | -| `programme` | TEXT | Haun nimi (asianumero) | -| `region` | TEXT | Alueet | +| Column | Type | Source header | +| ----------------------- | ------- | -------------------------------------------------- | +| `decision_date` | TEXT | Päätös pvm (ISO date) | +| `recipient` | TEXT | Saajan nimi (full original string, incl. y-tunnus) | +| `recipient_business_id` | TEXT | Y-tunnus extracted from Saajan nimi (indexed) | +| `granting_authority` | TEXT | Myöntäjä | +| `case_number` | TEXT | Asianumero | +| `amount_applied` | INTEGER | Haettu (EUR, nullable) | +| `amount_granted` | INTEGER | Myönnetty (EUR, nullable) | +| `has_eu_funding` | INTEGER | EU-varat (0/1) | +| `purpose` | TEXT | Hyväksytty käyttötarkoitus | +| `programme` | TEXT | Haun nimi (asianumero) | +| `region` | TEXT | Alueet | `amount_applied` / `amount_granted` are nullable so an unknown amount stays distinguishable from a real `0 €` decision in aggregates. +`recipient_business_id` is `NULL` for recipients that don't have a Finnish Business ID — private persons, foreign entities, and ad-hoc working groups. 
The loader logs the count of such rows under `recipientsWithoutBusinessId`. Use `recipient_business_id = ''` for indexed equality lookups and `GROUP BY recipient_business_id` to aggregate per legal entity. + ## Example session ``` diff --git a/src/cli/grants-explorer/clients/database.test.ts b/src/cli/grants-explorer/clients/database.test.ts index 2d86e31..090a104 100644 --- a/src/cli/grants-explorer/clients/database.test.ts +++ b/src/cli/grants-explorer/clients/database.test.ts @@ -13,6 +13,7 @@ const silentLogger = new Logger({ const row = (overrides: Partial = {}): GrantRow => ({ decision_date: "2026-01-15", recipient: "Test ry", + recipient_business_id: null, granting_authority: "Lapin ELY-keskus", case_number: "001", amount_applied: 1000, @@ -89,6 +90,43 @@ describe("GrantsDatabase", () => { expect(result[0]?.recipient).toBe("Lapin Martat ry (0210606-0)"); }); + it("equality query on recipient_business_id returns the matching row", () => { + db.insertRows([ + row({ + recipient: "Lapin Martat ry (0210606-0)", + recipient_business_id: "0210606-0", + }), + row({ + recipient: "Rikala-seura ry (2477520-6)", + recipient_business_id: "2477520-6", + }), + ]); + const result = db.query<{ recipient: string }>( + "SELECT recipient FROM grants WHERE recipient_business_id = ?", + ["0210606-0"] + ); + expect(result).toHaveLength(1); + expect(result[0]?.recipient).toBe("Lapin Martat ry (0210606-0)"); + }); + + it("recipient_business_id is genuinely nullable (round-trips NULL)", () => { + db.insertRows([ + row({ + recipient: "Anonymous private grantee", + recipient_business_id: null, + }), + row({ + recipient: "Lapin Martat ry (0210606-0)", + recipient_business_id: "0210606-0", + }), + ]); + const result = db.query<{ recipient: string }>( + "SELECT recipient FROM grants WHERE recipient_business_id IS NULL" + ); + expect(result).toHaveLength(1); + expect(result[0]?.recipient).toBe("Anonymous private grantee"); + }); + it("CHECK constraint rejects out-of-range has_eu_funding", 
() => { expect(() => { db.insertRows([row({ has_eu_funding: 2 as unknown as 0 | 1 })]); diff --git a/src/cli/grants-explorer/clients/database.ts b/src/cli/grants-explorer/clients/database.ts index a9d97b2..77c515a 100644 --- a/src/cli/grants-explorer/clients/database.ts +++ b/src/cli/grants-explorer/clients/database.ts @@ -26,6 +26,7 @@ export class GrantsDatabase { id INTEGER PRIMARY KEY AUTOINCREMENT, decision_date TEXT, recipient TEXT, + recipient_business_id TEXT, granting_authority TEXT, case_number TEXT, amount_applied INTEGER, @@ -37,6 +38,7 @@ export class GrantsDatabase { ); CREATE INDEX idx_grants_granting_authority ON grants(granting_authority); + CREATE INDEX idx_grants_recipient_business_id ON grants(recipient_business_id); CREATE INDEX idx_grants_region ON grants(region); CREATE INDEX idx_grants_decision_date ON grants(decision_date); CREATE INDEX idx_grants_has_eu_funding ON grants(has_eu_funding); @@ -47,10 +49,10 @@ export class GrantsDatabase { insertRows(rows: GrantRow[]): void { const insert = this.db.prepare(` INSERT INTO grants ( - decision_date, recipient, granting_authority, case_number, - amount_applied, amount_granted, has_eu_funding, + decision_date, recipient, recipient_business_id, granting_authority, + case_number, amount_applied, amount_granted, has_eu_funding, purpose, programme, region - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
`); this.db.exec("BEGIN"); @@ -59,6 +61,7 @@ export class GrantsDatabase { insert.run( row.decision_date, row.recipient, + row.recipient_business_id, row.granting_authority, row.case_number, row.amount_applied, diff --git a/src/cli/grants-explorer/clients/xlsx-loader.test.ts b/src/cli/grants-explorer/clients/xlsx-loader.test.ts index 8c7935a..f5bbe3a 100644 --- a/src/cli/grants-explorer/clients/xlsx-loader.test.ts +++ b/src/cli/grants-explorer/clients/xlsx-loader.test.ts @@ -1,4 +1,8 @@ -import { describe, expect, it } from "vitest"; +import { mkdtemp, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { Logger } from "~clients/logger"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; import XLSX from "xlsx"; import { @@ -6,8 +10,54 @@ import { normalizeEuFunding, normalizeExcelDate, normalizeText, + XlsxLoader, } from "./xlsx-loader"; +const silentLogger = new Logger({ + level: "error", + useColors: false, + useTimestamps: false, +}); + +const HEADER_ROW = [ + "Päätös pvm", + "Saajan nimi", + "Myöntäjä", + "Asianumero", + "Haettu", + "Myönnetty", + "EU-varat", + "Hyväksytty käyttötarkoitus", + "Haun nimi (asianumero)", + "Alueet", +]; + +const dataRow = (recipient: string) => [ + 46022, + recipient, + "Test ELY-keskus", + "T-001", + 1000, + 800, + "", + "Test purpose", + "Test programme (key-1)", + "Test region", +]; + +const writeFixtureXlsx = async ( + workDir: string, + rows: string[] +): Promise => { + const wb = XLSX.utils.book_new(); + const ws = XLSX.utils.aoa_to_sheet([HEADER_ROW, ...rows.map(dataRow)]); + XLSX.utils.book_append_sheet(wb, ws, "Export"); + const buf = XLSX.write(wb, { type: "buffer", bookType: "xlsx" }) as Buffer; + const filePath = join(workDir, "fixture.xlsx"); + await writeFile(filePath, buf); + return filePath; +}; + type ParsedDateCode = { y: number; m: number; d: number }; const ssf = XLSX.SSF as { parse_date_code: (n: number) => ParsedDateCode | 
undefined; @@ -105,3 +155,35 @@ describe("normalizeText", () => { expect(normalizeText(" ")).toBeNull(); }); }); + +// Loader integration: builds a minimal real xlsx through SheetJS, writes it +// to a temp file, and runs the full load() pipeline. This protects against +// regressions where extractBusinessId is wired up wrong and recipient_business_id +// ends up always-null while the parser unit test still passes. +describe("XlsxLoader.load() — recipient_business_id wiring", () => { + let workDir: string; + + beforeEach(async () => { + workDir = await mkdtemp(join(tmpdir(), "xlsx-loader-test-")); + }); + + afterEach(async () => { + await rm(workDir, { recursive: true, force: true }); + }); + + it("extracts recipient_business_id and preserves the full recipient string", async () => { + const filePath = await writeFixtureXlsx(workDir, [ + "Lapin Martat ry (0210606-0)", + "Anonymous private grantee", + ]); + + const rows = new XlsxLoader({ logger: silentLogger }).load(filePath); + + expect(rows).toHaveLength(2); + expect(rows[0]?.recipient).toBe("Lapin Martat ry (0210606-0)"); + expect(rows[0]?.recipient_business_id).toBe("0210606-0"); + // Bare-name recipient is still inserted; only the business_id is null. 
+ expect(rows[1]?.recipient).toBe("Anonymous private grantee"); + expect(rows[1]?.recipient_business_id).toBeNull(); + }); +}); diff --git a/src/cli/grants-explorer/clients/xlsx-loader.ts b/src/cli/grants-explorer/clients/xlsx-loader.ts index e49e7af..5769838 100644 --- a/src/cli/grants-explorer/clients/xlsx-loader.ts +++ b/src/cli/grants-explorer/clients/xlsx-loader.ts @@ -3,6 +3,7 @@ import XLSX from "xlsx"; import type { GrantRow } from "../types/schemas"; import { GrantRowSchema } from "../types/schemas"; +import { extractBusinessId } from "../utils/business-id"; type RawCell = string | number | boolean | Date | null; type RawRow = RawCell[]; @@ -203,9 +204,13 @@ export class XlsxLoader { }); } + // Normalize recipient once so the null-propagation chain (null cell → + // null recipient → null business_id) is obvious and we don't trim twice. + const recipient = normalizeText(at("recipient")); const candidate = { decision_date: decisionDate, - recipient: normalizeText(at("recipient")), + recipient, + recipient_business_id: extractBusinessId(recipient), granting_authority: normalizeText(at("granting_authority")), case_number: normalizeText(at("case_number")), amount_applied: normalizeAmount(at("amount_applied")), @@ -238,11 +243,21 @@ export class XlsxLoader { ); } + // Count loaded rows where the recipient string is present but no + // y-tunnus could be extracted (private persons, foreign entities, + // working groups). Surfacing this in the loader summary makes silent + // source-format drift visible — e.g. if a future export starts placing + // y-tunnus somewhere other than the trailing parenthetical. 
+ const recipientsWithoutBusinessId = rows.filter( + (r) => r.recipient !== null && r.recipient_business_id === null + ).length; + this.logger.info("Loaded xlsx rows", { filePath, rowCount: rows.length, dateNormalizationFailures, validationFailures, + recipientsWithoutBusinessId, }); return rows; } diff --git a/src/cli/grants-explorer/main.ts b/src/cli/grants-explorer/main.ts index ab57890..4bd2d59 100644 --- a/src/cli/grants-explorer/main.ts +++ b/src/cli/grants-explorer/main.ts @@ -65,7 +65,8 @@ Use the \`query_grants\` tool to run SQL SELECT queries against the \`grants\` t Schema (English column | source Finnish header): - decision_date (Päätös pvm) ISO date 'YYYY-MM-DD', may be NULL -- recipient (Saajan nimi) incl. y-tunnus in parens, e.g. "Lapin Martat ry (0210606-0)" +- recipient (Saajan nimi) full original name, e.g. "Lapin Martat ry (0210606-0)" +- recipient_business_id (extracted from Saajan nimi) Y-tunnus only, e.g. "0210606-0"; NULL for recipients without one (private persons, foreign entities, working groups). Indexed. - granting_authority (Myöntäjä) e.g. "Lapin ELY-keskus" - case_number (Asianumero) TEXT - amount_applied (Haettu) EUR, may be NULL @@ -77,7 +78,8 @@ Schema (English column | source Finnish header): Notes: - Amounts and dates can be NULL; SUM/AVG handle that correctly. -- For y-tunnus searches use LIKE '%%' on recipient. +- For Y-tunnus equality searches use recipient_business_id = '' (preferred — indexed and exact). The full string is also available in recipient for substring/name matching. +- To aggregate grants per legal entity, GROUP BY recipient_business_id (and filter out NULL when only registered entities are wanted). - Answer in the language of the user's question (Finnish or English). - Be concise and grounded in the SQL results; don't invent numbers. 
diff --git a/src/cli/grants-explorer/tools/sql-tool.ts b/src/cli/grants-explorer/tools/sql-tool.ts index 01855e9..6521de2 100644 --- a/src/cli/grants-explorer/tools/sql-tool.ts +++ b/src/cli/grants-explorer/tools/sql-tool.ts @@ -48,7 +48,8 @@ Table: grants Columns (English name | source Finnish header | type): - id | (auto) | INTEGER PRIMARY KEY - decision_date | Päätös pvm | TEXT, ISO date 'YYYY-MM-DD', may be NULL -- recipient | Saajan nimi | TEXT, includes y-tunnus in parentheses (e.g. "Lapin Martat ry (0210606-0)") +- recipient | Saajan nimi | TEXT, full original name string (e.g. "Lapin Martat ry (0210606-0)") +- recipient_business_id | (extracted from Saajan nimi) | TEXT, Y-tunnus only (e.g. "0210606-0"), NULL when the recipient has no business ID (private persons, foreign entities, working groups). Indexed. - granting_authority | Myöntäjä | TEXT, e.g. "Lapin ELY-keskus" - case_number | Asianumero | TEXT - amount_applied | Haettu | INTEGER, EUR, may be NULL @@ -61,7 +62,7 @@ Columns (English name | source Finnish header | type): Rules: - Only one SELECT statement; no semicolons, no DDL/DML keywords. - Amounts and dates can be NULL; use IS NULL / IS NOT NULL where it matters. -- Use LIKE for partial text matches (e.g. y-tunnus inside recipient). +- For Y-tunnus searches prefer the indexed equality column: recipient_business_id = ''. LIKE on recipient still works for partial-name matches. Example queries: - Total granted per authority: @@ -69,7 +70,9 @@ Example queries: - Top 5 single grants: SELECT decision_date, recipient, amount_granted FROM grants ORDER BY amount_granted DESC LIMIT 5 - EU-funded vs. 
not: - SELECT has_eu_funding, COUNT(*) AS n, SUM(amount_granted) AS sum_eur FROM grants GROUP BY has_eu_funding`, + SELECT has_eu_funding, COUNT(*) AS n, SUM(amount_granted) AS sum_eur FROM grants GROUP BY has_eu_funding +- Top 10 recipients by total granted (one row per legal entity): + SELECT recipient_business_id, MAX(recipient) AS name, SUM(amount_granted) AS total FROM grants WHERE recipient_business_id IS NOT NULL GROUP BY recipient_business_id ORDER BY total DESC LIMIT 10`, parameters: z.object({ sql: z.string().describe("A single SQL SELECT query"), }), diff --git a/src/cli/grants-explorer/types/schemas.ts b/src/cli/grants-explorer/types/schemas.ts index a49d43d..014c2a0 100644 --- a/src/cli/grants-explorer/types/schemas.ts +++ b/src/cli/grants-explorer/types/schemas.ts @@ -33,6 +33,14 @@ export type GrantsAgentOutput = z.infer; export const GrantRowSchema = z.object({ decision_date: z.string().nullable(), recipient: z.string().nullable(), + // Y-tunnus (Finnish Business ID) extracted from `recipient` parens. The + // regex constraint can never reject a legitimately-loaded row — the loader + // emits only matching values or null — but it serves as a tripwire if a + // future refactor accidentally pipes the wrong field in. 
+ recipient_business_id: z + .string() + .regex(/^\d{7}-\d$/) + .nullable(), granting_authority: z.string().nullable(), case_number: z.string().nullable(), amount_applied: z.number().int().nullable(), diff --git a/src/cli/grants-explorer/utils/business-id.test.ts b/src/cli/grants-explorer/utils/business-id.test.ts new file mode 100644 index 0000000..84d667d --- /dev/null +++ b/src/cli/grants-explorer/utils/business-id.test.ts @@ -0,0 +1,35 @@ +import { describe, expect, it } from "vitest"; + +import { extractBusinessId } from "./business-id"; + +describe("extractBusinessId", () => { + it.each([ + ["Suomen elokuvasäätiö sr (0202113-9)", "0202113-9"], + ["Lapin Martat ry (0210606-0)", "0210606-0"], + // Defensive: recipient is normalizeText-trimmed upstream, but the regex + // tolerates trailing whitespace so this can't silently regress. + ["Foo ry (0202113-9) ", "0202113-9"], + // Multiple parens: only the trailing y-tunnus is extracted, not a + // mid-string lookalike. + ["Org with two ids (1234567-8) (0202113-9)", "0202113-9"], + ])("extracts y-tunnus from %p", (input, expected) => { + expect(extractBusinessId(input)).toBe(expected); + }); + + it.each([ + ["Bare individual name"], + [""], + // Wrong digit counts → not a valid y-tunnus shape. + ["Wrong format (12345-67)"], + ["Wrong format (12345678-9)"], + // Not end-anchored: a mid-string y-tunnus without the trailing form is + // intentionally NOT extracted. + ["Mid-string (0202113-9) extra text"], + ])("returns null for %p", (input) => { + expect(extractBusinessId(input)).toBeNull(); + }); + + it("returns null for null input", () => { + expect(extractBusinessId(null)).toBeNull(); + }); +}); diff --git a/src/cli/grants-explorer/utils/business-id.ts b/src/cli/grants-explorer/utils/business-id.ts new file mode 100644 index 0000000..e38ff0a --- /dev/null +++ b/src/cli/grants-explorer/utils/business-id.ts @@ -0,0 +1,16 @@ +// Finnish Business ID (y-tunnus): exactly 7 digits, hyphen, 1 check digit. 
+// In paatokset.xlsx it appears as a trailing parenthetical suffix on the +// Saajan nimi column, e.g. "Suomen elokuvasäätiö sr (0202113-9)". +// +// End-anchored on purpose: only the trailing form is canonical. Mid-string +// matches would risk false positives if any future cell text contains a +// digit-hyphen-digit run, and missing values for those rows are surfaced via +// the loader's unmatched-count log rather than guessed at. +const Y_TUNNUS_TRAILING = /\((\d{7}-\d)\)\s*$/; + +export const extractBusinessId = (recipient: string | null): string | null => { + if (!recipient) { + return null; + } + return Y_TUNNUS_TRAILING.exec(recipient)?.[1] ?? null; +}; From ea0286904944977038104668e28473dc3b468b87 Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Sun, 31 May 2026 10:54:49 +0300 Subject: [PATCH 4/5] feat: load every sektoriluokitus and emit combined grants.json --- src/cli/grants-explorer/README.md | 96 +-- .../clients/combined-writer.test.ts | 105 ++++ .../clients/combined-writer.ts | 42 ++ .../grants-explorer/clients/database.test.ts | 22 + src/cli/grants-explorer/clients/database.ts | 14 +- .../grants-explorer/clients/manifest.test.ts | 55 ++ src/cli/grants-explorer/clients/manifest.ts | 35 ++ .../clients/xlsx-downloader.test.ts | 200 +++++-- .../clients/xlsx-downloader.ts | 554 ++++++++++++++---- .../clients/xlsx-loader.test.ts | 27 +- .../grants-explorer/clients/xlsx-loader.ts | 14 +- src/cli/grants-explorer/constants.ts | 7 +- src/cli/grants-explorer/main.ts | 65 +- src/cli/grants-explorer/tools/sql-tool.ts | 29 +- src/cli/grants-explorer/types/schemas.test.ts | 8 +- src/cli/grants-explorer/types/schemas.ts | 24 +- .../utils/sector-parser.test.ts | 54 ++ .../grants-explorer/utils/sector-parser.ts | 40 ++ 18 files changed, 1154 insertions(+), 237 deletions(-) create mode 100644 src/cli/grants-explorer/clients/combined-writer.test.ts create mode 100644 src/cli/grants-explorer/clients/combined-writer.ts 
create mode 100644 src/cli/grants-explorer/clients/manifest.test.ts create mode 100644 src/cli/grants-explorer/clients/manifest.ts create mode 100644 src/cli/grants-explorer/utils/sector-parser.test.ts create mode 100644 src/cli/grants-explorer/utils/sector-parser.ts diff --git a/src/cli/grants-explorer/README.md b/src/cli/grants-explorer/README.md index 752f6ce..479e0a3 100644 --- a/src/cli/grants-explorer/README.md +++ b/src/cli/grants-explorer/README.md @@ -1,54 +1,76 @@ # Grants Explorer -Loads the Finnish grant-decisions workbook at `tmp/paatokset.xlsx` into an in-memory SQLite database and answers natural-language questions via an OpenAI agent that has a read-only `query_grants` SQL tool. +Loads the Finnish grant-decisions workbook into an in-memory SQLite database and answers natural-language questions via an OpenAI agent that has a read-only `query_grants` SQL tool. The dataset spans every Sektoriluokitus (Finnish institutional sector classification) — one xlsx is fetched per sector and tagged at load. ## Run ``` pnpm run:grants-explorer -pnpm run:grants-explorer --file=tmp/paatokset.xlsx +pnpm run:grants-explorer --dir=tmp/grants-explorer/paatokset pnpm run:grants-explorer --refetch ``` ## Arguments -- `--file` (optional): path to the xlsx workbook. Defaults to `tmp/paatokset.xlsx`. -- `--refetch` (optional, presence-only flag): force-download the latest xlsx from [tutkihallintoa.fi](https://www.tutkihallintoa.fi/valtionavustukset/tutkiavustuksia/) before loading. Without the flag, the CLI uses the local file if present and auto-downloads only when it's missing. Pass it bare (`--refetch`) to enable; omit it to disable. Any explicit value (`--refetch=false`, `--refetch=true`, …) is rejected by the schema. +- `--dir` (optional): path to the per-sector xlsx directory. Defaults to `tmp/grants-explorer/paatokset`. 
+- `--refetch` (optional, presence-only flag): re-fetch from [tutkihallintoa.fi](https://www.tutkihallintoa.fi/valtionavustukset/tutkiavustuksia/) before loading. The fetch is resume-friendly: any per-sector `.xlsx` already on disk and parseable is skipped, so re-running after a mid-loop failure only fetches the missing sectors. Pass it bare (`--refetch`) to enable; omit to load from cache. Any explicit value (`--refetch=false`, `--refetch=true`, …) is rejected by the schema. To force a full re-download, delete the directory: `rm -rf tmp/grants-explorer/paatokset && pnpm run:grants-explorer --refetch`. + +Without `--refetch`, the CLI auto-fetches only when the `sectors.json` manifest is missing. + +## Cache layout + +``` +tmp/grants-explorer/ +├── grants.json # combined dataset: a single JSON array of every GrantRow +└── paatokset/ + ├── sectors.json # manifest: [{ code: "S11", label: "Yritykset" }, …] + ├── S11.xlsx + ├── S12.xlsx + ├── … + └── S15.xlsx +``` + +The manifest is written **only after every sector finishes downloading**. A missing manifest therefore signals an incomplete cache, regardless of how many `.xlsx` files are present. + +`grants.json` is rewritten after every successful load (atomic temp + rename). It mirrors the in-memory dataset 1:1 with snake_case field names and is the recommended artifact for downstream tools (jq, duckdb, pandas). The per-sector `.xlsx` files stay alongside because the downloader uses them for resume-after-failure semantics — `grants.json` is a derived artifact, not a replacement cache. ## Source data -`paatokset.xlsx` is downloaded from the Tutkiavustuksia.fi Power BI report, pre-filtered to: +xlsx files are downloaded from the Tutkiavustuksia.fi Power BI report, tab **Avustusasiat**, **"Myönteiset päätökset"** (positive grant decisions) table. 
The Sektoriluokitus slicer is iterated through every option discovered live in the report — including the `(Tyhjä)` and `Sektoriluokitus puuttuu` buckets — so the per-sector exports form a complete partition of all positive decisions. After download, the loader reconciles the summed row count against the report's "Myönteiset avustuspäätökset" headline and warns on a shortfall. Each value is exported separately because Power BI caps a single export at 150 000 rows. -- Tab: **Avustusasiat** -- Slicer: **Sektoriluokitus = S15 Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt** (Non-profit institutions serving households) +Note: this is the _decisions_ view, not "Saapuneet hakemukset" (received applications, a larger superset that includes rejected/pending requests). -Other filter scopes (date ranges, other sectors, other tabs) are intentionally not exposed as CLI flags — broadening the scope would change which grants land in the SQL DB and invalidate any saved analyses. Filter per-query in SQL after load instead. +Other filter scopes (date ranges, other tabs) are intentionally not exposed as CLI flags — broadening the scope would change which grants land in the SQL DB and invalidate any saved analyses. Filter per-query in SQL after load instead. ## Table schema -| Column | Type | Source header | -| ----------------------- | ------- | -------------------------------------------------- | -| `decision_date` | TEXT | Päätös pvm (ISO date) | -| `recipient` | TEXT | Saajan nimi (full original string, incl. 
y-tunnus) | -| `recipient_business_id` | TEXT | Y-tunnus extracted from Saajan nimi (indexed) | -| `granting_authority` | TEXT | Myöntäjä | -| `case_number` | TEXT | Asianumero | -| `amount_applied` | INTEGER | Haettu (EUR, nullable) | -| `amount_granted` | INTEGER | Myönnetty (EUR, nullable) | -| `has_eu_funding` | INTEGER | EU-varat (0/1) | -| `purpose` | TEXT | Hyväksytty käyttötarkoitus | -| `programme` | TEXT | Haun nimi (asianumero) | -| `region` | TEXT | Alueet | +| Column | Type | Source header | +| ----------------------- | ------- | ---------------------------------------------------- | +| `decision_date` | TEXT | Päätös pvm (ISO date) | +| `recipient` | TEXT | Saajan nimi (full original string, incl. y-tunnus) | +| `recipient_business_id` | TEXT | Y-tunnus extracted from Saajan nimi (indexed) | +| `granting_authority` | TEXT | Myöntäjä | +| `case_number` | TEXT | Asianumero | +| `amount_applied` | INTEGER | Haettu (EUR, nullable) | +| `amount_granted` | INTEGER | Myönnetty (EUR, nullable) | +| `has_eu_funding` | INTEGER | EU-varat (0/1) | +| `purpose` | TEXT | Hyväksytty käyttötarkoitus | +| `programme` | TEXT | Haun nimi (asianumero) | +| `region` | TEXT | Alueet | +| `sektoriluokitus_code` | TEXT | Sektoriluokitus code (e.g. `S15`), NOT NULL, indexed | +| `sektoriluokitus_label` | TEXT | Sektoriluokitus human label, NOT NULL | `amount_applied` / `amount_granted` are nullable so an unknown amount stays distinguishable from a real `0 €` decision in aggregates. `recipient_business_id` is `NULL` for recipients that don't have a Finnish Business ID — private persons, foreign entities, and ad-hoc working groups. The loader logs the count of such rows under `recipientsWithoutBusinessId`. Use `recipient_business_id = ''` for indexed equality lookups and `GROUP BY recipient_business_id` to aggregate per legal entity. 
+`sektoriluokitus_code` and `sektoriluokitus_label` originate from the manifest, not the xlsx itself — every row of `<code>.xlsx` is tagged with the matching manifest entry at load time. Codes are `S` + 1–6 digits (coarse like `S11` and deep like `S131311` coexist, since the source classifies at varying precision). Two sentinel codes cover the sector-less rows: `BLANK` (slicer `(Tyhjä)`, a null value) and `PUUTTUU` (the source's explicit `Sektoriluokitus puuttuu`). To reproduce the legacy NPISH-only view, filter `WHERE sektoriluokitus_code = 'S15'`; for classified-only analysis use `WHERE sektoriluokitus_code LIKE 'S%'`. + ## Example session ``` $ pnpm run:grants-explorer -Ask about Finnish grant decisions: How much has Lapin ELY-keskus granted in total? +Ask about Finnish grant decisions: How much has Lapin ELY-keskus granted in total across all sectors? [ANSWER] Lapin ELY-keskus has granted approximately X € across N decisions. ``` @@ -56,20 +78,28 @@ Ask about Finnish grant decisions: How much has Lapin ELY-keskus granted in tota ```mermaid flowchart TD - A["Start"] --> B["Parse --file"] - B --> C["XlsxLoader.load() → GrantRow[]"] - C --> D["GrantsDatabase :memory: INSERT"] - D --> E["AgentRunner with query_grants tool"] - E --> F{"User question?"} - F -->|"yes"| G["Agent runs SQL via tool"] - G --> H{"status"} - H -->|"final"| I["Print answer"] - H -->|"needs_clarification"| F - F -->|"empty"| J["Done"] - I --> J + A["Start"] --> B["Parse --dir / --refetch"] + B --> C{"manifest present and not --refetch?"} + C -->|"no"| D["XlsxDownloader: discover sectors, fetch each, write manifest"] + C -->|"yes"| E["Read manifest"] + D --> E + E --> F["For each sector: XlsxLoader.load() → GrantRow[] with sektoriluokitus tag"] + F --> G["GrantsDatabase :memory: INSERT"] + G --> H["AgentRunner with query_grants tool"] + H --> I{"User question?"} + I -->|"yes"| J["Agent runs SQL via tool"] + J --> K{"status"} + K -->|"final"| L["Print answer"] + K -->|"needs_clarification"| I + I
-->|"empty"| M["Done"] + L --> M ``` ## Notes - `xlsx` (SheetJS) is used because the source workbook omits the optional cell `r` (reference) attribute and uses an unusual `x:` element-namespace prefix; `read-excel-file` and `exceljs` both rejected this layout in testing. - `paatos_pvm` cells arrive as raw Excel serial numbers (date styling without the `t="d"` cell type), so the loader explicitly converts via `XLSX.SSF.parse_date_code`. +- The Sektoriluokitus slicer is virtualized. To enumerate every option the downloader opens the dropdown and walks the listbox via keyboard `ArrowDown`, reading the focused option's text each step. Power BI auto-scrolls the focused row into view, which is more robust than guessing a scroll-container CSS class. A `MIN_EXPECTED_SECTORS` guard aborts the run with a debug screenshot if discovery returns fewer sectors than expected, and the post-download headline reconciliation catches a partial miss that still clears the guard. +- Selecting a filter: real S-codes use search-then-click (type the code into "Hae", click the row whose text starts with `"<code> "`; the trailing space stops `S1313` from also matching `S131311`). The two sentinel buckets `(Tyhjä)` and `Sektoriluokitus puuttuu` use keyboard-nav selection — ArrowDown until `document.activeElement.innerText` equals the target label, then click the focused row. Playwright's substring text locators (`getByText("(Tyhjä)")`) proved unreliable here: the row's whole-element text doesn't normalize to the bare label. +- The slicer dropdown is always closed via `Escape` in a `try/finally` around the selection block. If selection throws and the dropdown stays open, the next sector's `dropdown.click()` would toggle it shut instead of opening it — and the subsequent 'Hae' visibility check would fail, aborting the whole run. +- The slicer is assumed to be single-select; clicking sector S12 after S11 deselects S11 automatically.
The per-sector zero-rows assertion in `downloadOneSector` will surface a regression to multi-select (every export after the first would be empty). diff --git a/src/cli/grants-explorer/clients/combined-writer.test.ts b/src/cli/grants-explorer/clients/combined-writer.test.ts new file mode 100644 index 0000000..f15a474 --- /dev/null +++ b/src/cli/grants-explorer/clients/combined-writer.test.ts @@ -0,0 +1,105 @@ +import { mkdtemp, readdir, readFile, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { Logger } from "~clients/logger"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { z } from "zod"; + +import { GrantRowSchema } from "../types/schemas"; +import type { GrantRow } from "../types/schemas"; +import { writeCombinedGrants } from "./combined-writer"; + +const silentLogger = new Logger({ + level: "error", + useColors: false, + useTimestamps: false, +}); + +const sampleRows: GrantRow[] = [ + { + decision_date: "2024-01-15", + recipient: "Foo ry (1234567-8)", + recipient_business_id: "1234567-8", + granting_authority: "Lapin ELY-keskus", + case_number: "L-001", + amount_applied: 100_000, + amount_granted: 80_000, + has_eu_funding: 1, + purpose: "Test purpose", + programme: "Test programme (key-1)", + region: "Lappi", + sektoriluokitus_code: "S15", + sektoriluokitus_label: + "Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt", + }, + { + // Every nullable field null — pins that NULLs round-trip as JSON null, + // not undefined / missing keys. + decision_date: null, + recipient: null, + recipient_business_id: null, + granting_authority: null, + case_number: null, + amount_applied: null, + amount_granted: null, + has_eu_funding: 0, + purpose: null, + programme: null, + region: null, + sektoriluokitus_code: "BLANK", + sektoriluokitus_label: "(Tyhjä)", + }, + { + // Sentinel sector code on the explicit "missing" bucket. 
+ decision_date: "2025-03-01", + recipient: "Working group (no business id)", + recipient_business_id: null, + granting_authority: "Pohjois-Pohjanmaan ELY-keskus", + case_number: "P-042", + amount_applied: 50_000, + amount_granted: 50_000, + has_eu_funding: 1, + purpose: "Another purpose", + programme: "Another programme", + region: "Pohjois-Pohjanmaa", + sektoriluokitus_code: "PUUTTUU", + sektoriluokitus_label: "Sektoriluokitus puuttuu", + }, +]; + +describe("writeCombinedGrants", () => { + let dir: string; + + beforeEach(async () => { + dir = await mkdtemp(join(tmpdir(), "combined-writer-test-")); + }); + + afterEach(async () => { + await rm(dir, { recursive: true, force: true }); + }); + + it("round-trips GrantRow[] through write + parse with schema-valid output", async () => { + const path = join(dir, "grants.json"); + await writeCombinedGrants({ logger: silentLogger, path, rows: sampleRows }); + + const raw = await readFile(path, "utf8"); + const parsed: unknown = JSON.parse(raw); + const validated = z.array(GrantRowSchema).parse(parsed); + + expect(validated).toEqual(sampleRows); + // Nullable fields must survive the round-trip as JSON null (not stripped + // to undefined / missing keys), or downstream tools relying on positional + // shape would silently see different columns per row. 
+ expect(validated[1]?.amount_granted).toBeNull(); + expect(validated[1]?.recipient_business_id).toBeNull(); + }); + + it("renames the temp file atomically (no .tmp-* leftovers after success)", async () => { + const path = join(dir, "grants.json"); + await writeCombinedGrants({ logger: silentLogger, path, rows: sampleRows }); + + const entries = await readdir(dir); + expect(entries).toEqual(["grants.json"]); + expect(entries.some((f) => f.includes(".tmp-"))).toBe(false); + }); +}); diff --git a/src/cli/grants-explorer/clients/combined-writer.ts b/src/cli/grants-explorer/clients/combined-writer.ts new file mode 100644 index 0000000..c437d84 --- /dev/null +++ b/src/cli/grants-explorer/clients/combined-writer.ts @@ -0,0 +1,42 @@ +import { randomUUID } from "node:crypto"; +import { rename, unlink, writeFile } from "node:fs/promises"; +import type { Logger } from "~clients/logger"; + +import type { GrantRow } from "../types/schemas"; + +export type WriteCombinedGrantsOptions = { + logger: Logger; + path: string; + rows: GrantRow[]; +}; + +/** + * Serialize the full GrantRow[] to a single pretty-printed JSON array at + * `path`. Writes to a sibling temp file first and renames into place so a + * crash mid-write never leaves a half-written 50 MB blob at the canonical + * path (which would silently look "complete" to downstream tools). + * + * No schema validation here — rows come straight out of XlsxLoader which + * already validates via GrantRowSchema; re-validating 150k rows on write + * would just burn CPU. The round-trip test pins the on-disk shape. 
+ */ +export const writeCombinedGrants = async ({ + logger, + path, + rows, +}: WriteCombinedGrantsOptions): Promise => { + const tempPath = `${path}.tmp-${randomUUID().slice(0, 8)}`; + try { + await writeFile(tempPath, `${JSON.stringify(rows, null, 2)}\n`, "utf8"); + await rename(tempPath, path); + logger.info("Wrote combined grants JSON", { + path, + rowCount: rows.length, + }); + } catch (error) { + await unlink(tempPath).catch(() => { + // intentional: temp may not exist yet (failure before writeFile) + }); + throw error; + } +}; diff --git a/src/cli/grants-explorer/clients/database.test.ts b/src/cli/grants-explorer/clients/database.test.ts index 090a104..5518114 100644 --- a/src/cli/grants-explorer/clients/database.test.ts +++ b/src/cli/grants-explorer/clients/database.test.ts @@ -22,6 +22,9 @@ const row = (overrides: Partial = {}): GrantRow => ({ purpose: "Test purpose", programme: "Test programme", region: "Test region", + sektoriluokitus_code: "S15", + sektoriluokitus_label: + "Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt", ...overrides, }); @@ -132,4 +135,23 @@ describe("GrantsDatabase", () => { db.insertRows([row({ has_eu_funding: 2 as unknown as 0 | 1 })]); }).toThrow(); }); + + it("filters by sektoriluokitus_code via indexed equality", () => { + db.insertRows([ + row({ sektoriluokitus_code: "S15", sektoriluokitus_label: "NPISH" }), + row({ sektoriluokitus_code: "S11", sektoriluokitus_label: "Yritykset" }), + row({ sektoriluokitus_code: "S11", sektoriluokitus_label: "Yritykset" }), + ]); + const result = db.queryOne<{ n: number }>( + "SELECT COUNT(*) as n FROM grants WHERE sektoriluokitus_code = ?", + ["S11"] + ); + expect(result?.n).toBe(2); + }); + + it("rejects rows with NULL sektoriluokitus_code (schema NOT NULL)", () => { + expect(() => { + db.insertRows([row({ sektoriluokitus_code: null as unknown as string })]); + }).toThrow(); + }); }); diff --git a/src/cli/grants-explorer/clients/database.ts 
b/src/cli/grants-explorer/clients/database.ts index 77c515a..7ee8cf5 100644 --- a/src/cli/grants-explorer/clients/database.ts +++ b/src/cli/grants-explorer/clients/database.ts @@ -34,7 +34,9 @@ export class GrantsDatabase { has_eu_funding INTEGER NOT NULL CHECK (has_eu_funding IN (0, 1)), purpose TEXT, programme TEXT, - region TEXT + region TEXT, + sektoriluokitus_code TEXT NOT NULL, + sektoriluokitus_label TEXT NOT NULL ); CREATE INDEX idx_grants_granting_authority ON grants(granting_authority); @@ -42,6 +44,7 @@ export class GrantsDatabase { CREATE INDEX idx_grants_region ON grants(region); CREATE INDEX idx_grants_decision_date ON grants(decision_date); CREATE INDEX idx_grants_has_eu_funding ON grants(has_eu_funding); + CREATE INDEX idx_grants_sektoriluokitus_code ON grants(sektoriluokitus_code); `); this.logger.debug("Grants schema created"); } @@ -51,8 +54,9 @@ export class GrantsDatabase { INSERT INTO grants ( decision_date, recipient, recipient_business_id, granting_authority, case_number, amount_applied, amount_granted, has_eu_funding, - purpose, programme, region - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + purpose, programme, region, + sektoriluokitus_code, sektoriluokitus_label + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
`); this.db.exec("BEGIN"); @@ -69,7 +73,9 @@ export class GrantsDatabase { row.has_eu_funding, row.purpose, row.programme, - row.region + row.region, + row.sektoriluokitus_code, + row.sektoriluokitus_label ); } this.db.exec("COMMIT"); diff --git a/src/cli/grants-explorer/clients/manifest.test.ts b/src/cli/grants-explorer/clients/manifest.test.ts new file mode 100644 index 0000000..37496cf --- /dev/null +++ b/src/cli/grants-explorer/clients/manifest.test.ts @@ -0,0 +1,55 @@ +import { mkdtemp, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; + +import { MANIFEST_FILE } from "../constants"; +import { readManifest, writeManifest } from "./manifest"; + +describe("manifest", () => { + let dir: string; + + beforeEach(async () => { + dir = await mkdtemp(join(tmpdir(), "grants-manifest-test-")); + }); + + afterEach(async () => { + await rm(dir, { recursive: true, force: true }); + }); + + it("round-trips a manifest through writeManifest + readManifest", async () => { + const manifest = [ + { code: "S11", label: "Yritykset" }, + { + code: "S15", + label: "Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt", + }, + ]; + await writeManifest(dir, manifest); + expect(await readManifest(dir)).toEqual(manifest); + }); + + it("rejects an empty manifest (must contain at least one sector)", async () => { + await expect(writeManifest(dir, [])).rejects.toThrow(); + }); + + it("rejects a manifest with a malformed code", async () => { + await expect( + writeManifest(dir, [{ code: "X11", label: "Bad" }]) + ).rejects.toThrow(); + }); + + it("readManifest throws on non-JSON contents", async () => { + await writeFile(join(dir, MANIFEST_FILE), "not json{", "utf8"); + await expect(readManifest(dir)).rejects.toThrow(); + }); + + it("readManifest throws on JSON that doesn't match the schema", async () => { + await writeFile( + join(dir, MANIFEST_FILE), + 
JSON.stringify([{ code: "S15" /* missing label */ }]), + "utf8" + ); + await expect(readManifest(dir)).rejects.toThrow(); + }); +}); diff --git a/src/cli/grants-explorer/clients/manifest.ts b/src/cli/grants-explorer/clients/manifest.ts new file mode 100644 index 0000000..4cac728 --- /dev/null +++ b/src/cli/grants-explorer/clients/manifest.ts @@ -0,0 +1,35 @@ +import { readFile, writeFile } from "node:fs/promises"; +import { join } from "node:path"; + +import { MANIFEST_FILE } from "../constants"; +import type { SectorManifest } from "../types/schemas"; +import { SectorManifestSchema } from "../types/schemas"; + +const manifestPath = (dir: string): string => join(dir, MANIFEST_FILE); + +/** + * Reads and validates `<dir>/sectors.json`. Throws on missing file, malformed + * JSON, or shape that doesn't match SectorManifestSchema — the cached state + * is opaque without a valid manifest, so we'd rather fail loudly than load a + * silently-incomplete dataset. + */ +export const readManifest = async (dir: string): Promise => { + const raw = await readFile(manifestPath(dir), "utf8"); + const parsed: unknown = JSON.parse(raw); + return SectorManifestSchema.parse(parsed); +}; + +export const writeManifest = async ( + dir: string, + manifest: SectorManifest +): Promise => { + // Validate before writing so a corrupted in-memory list can't poison the + // on-disk cache. Pretty-print for easier diffing when the source upstream + // adds a new sektoriluokitus code.
+ const validated = SectorManifestSchema.parse(manifest); + await writeFile( + manifestPath(dir), + `${JSON.stringify(validated, null, 2)}\n`, + "utf8" + ); +}; diff --git a/src/cli/grants-explorer/clients/xlsx-downloader.test.ts b/src/cli/grants-explorer/clients/xlsx-downloader.test.ts index e40f2b2..c61ba95 100644 --- a/src/cli/grants-explorer/clients/xlsx-downloader.test.ts +++ b/src/cli/grants-explorer/clients/xlsx-downloader.test.ts @@ -1,18 +1,13 @@ -import { - mkdtemp, - readdir, - readFile, - rm, - stat, - writeFile, -} from "node:fs/promises"; +import { mkdtemp, readdir, readFile, rm, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; -import { dirname, join } from "node:path"; +import { join } from "node:path"; import { Logger } from "~clients/logger"; import type { Mock } from "vitest"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import XLSX from "xlsx"; +import { MANIFEST_FILE } from "../constants"; + // === Test fixture: a tiny valid xlsx with the schema the downloader's // post-download validation step expects (XlsxLoader). One header row + one // data row is enough to satisfy "rows.length > 0". ========================== @@ -55,14 +50,13 @@ type Behavior = { failTabClick?: boolean; produceInvalidXlsx?: boolean; fixture: Buffer; + discoveredSectorTexts: string[]; }; let closeMock: Mock; const resolved = vi.fn(() => Promise.resolve()); -// A chainable noop locator/handle. Most methods return the same object so -// `.first().click()`, `.filter({...}).first().click()`, etc. all resolve. type ChainOverrides = { click?: Mock; }; @@ -94,15 +88,46 @@ const buildPlaywrightMock = (behavior: Behavior) => { return Promise.resolve(); }); + // The downloader makes two flavors of `frame.evaluate(...)`: + // 1. Readiness check (returns boolean — anything truthy works here) + // 2. 
activeElement focused-text read (used by both discoverSectors and + // the keyboard-nav selection path for BLANK/PUUTTUU) + // We dispatch on the function source. The focused-text read is identified + // by its `document.activeElement` reference; everything else falls through + // to the readiness branch. + // + // `focusIndex` walks `discoveredSectorTexts` and resets to 0 every time the + // slicer dropdown is (re-)opened — that mirrors the real listbox restarting + // its focus at the top when the dropdown re-opens, so both the initial + // discovery walk AND each per-sector selection walk see a fresh sequence. + // Once the list is exhausted, the index sticks at the end so the + // stable-threshold loop can terminate. + let focusIndex = 0; + const evaluate = vi.fn((fn: unknown): Promise => { + const src = typeof fn === "function" ? fn.toString() : String(fn); + if (src.includes("activeElement")) { + const texts = behavior.discoveredSectorTexts; + const text = texts[focusIndex] ?? texts[texts.length - 1] ?? ""; + if (focusIndex < texts.length) { + focusIndex++; + } + return Promise.resolve(text); + } + return Promise.resolve(true); + }); + + const slicerDropdownClick = vi.fn(() => { + focusIndex = 0; + return Promise.resolve(); + }); + const frameMock = { url: () => "https://app.powerbi.com/reportEmbed?reportId=demo", - evaluate: vi.fn(() => Promise.resolve(true)), + evaluate, waitForTimeout: resolved, evaluateHandle: vi.fn(() => Promise.resolve({ asElement: () => ({ - // Used by both the slicer-search input and the Myönteiset table - // visualContainer in the downloader's call sequence. click: () => Promise.resolve(), boundingBox: () => Promise.resolve({ x: 100, y: 200, width: 800, height: 400 }), @@ -118,14 +143,19 @@ const buildPlaywrightMock = (behavior: Behavior) => { click: isAvustusasiat ? 
tabClick : resolved, }); }), - locator: vi.fn(() => makeChainable()), - getByRole: vi.fn(() => - makeChainable({ - // The optional "confirm export" dialog isn't shown in tests; the - // downloader catches this rejection and continues. - click: vi.fn(() => Promise.reject(new Error("no confirmation dialog"))), - }) - ), + locator: vi.fn((selector?: string) => { + // Re-opening the slicer dropdown resets the listbox focus to the first + // option in the real UI. Mirror that here so each per-sector selection + // walk sees a fresh sequence from the top of `discoveredSectorTexts`. + if ( + typeof selector === "string" && + selector.includes("Sektoriluokitus") + ) { + return makeChainable({ click: slicerDropdownClick }); + } + return makeChainable(); + }), + getByRole: vi.fn(() => makeChainable()), }; const downloadMock = { @@ -170,7 +200,6 @@ const buildPlaywrightMock = (behavior: Behavior) => { }; }; -// Module-scoped mock holders mutated by each test. let playwrightMock: ReturnType; vi.mock("playwright", () => ({ get chromium() { @@ -186,10 +215,12 @@ const silentLogger = new Logger({ describe("XlsxDownloader", () => { let workDir: string; + let destDir: string; let fixture: Buffer; beforeEach(async () => { workDir = await mkdtemp(join(tmpdir(), "xlsx-downloader-test-")); + destDir = join(workDir, "paatokset"); fixture = buildFixtureXlsx(); }); @@ -197,55 +228,136 @@ describe("XlsxDownloader", () => { await rm(workDir, { recursive: true, force: true }); }); - it("creates nested destination directories before writing", async () => { - playwrightMock = buildPlaywrightMock({ fixture }); + it("writes one xlsx per discovered sector (incl. 
deep codes + blank buckets) + a manifest", async () => { + playwrightMock = buildPlaywrightMock({ + fixture, + discoveredSectorTexts: [ + "Valitse kaikki", // select-all control — must be skipped + "(Tyhjä)", // blank bucket → BLANK + "S11 Yritykset", + "S131311 Kunnat", // 6-digit code the old \d{2,4} regex dropped + "S15 Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt", + "Sektoriluokitus puuttuu", // missing bucket → PUUTTUU + ], + }); const { XlsxDownloader } = await import("./xlsx-downloader"); - const dest = join(workDir, "nested", "deep", "paatokset.xlsx"); await new XlsxDownloader({ logger: silentLogger, sourceUrl: "https://example.invalid/source", - }).download(dest); + }).download(destDir); - const info = await stat(dest); - expect(info.size).toBeGreaterThan(0); + const files = (await readdir(destDir)).sort(); + expect(files).toEqual([ + "BLANK.xlsx", + "PUUTTUU.xlsx", + "S11.xlsx", + "S131311.xlsx", + "S15.xlsx", + MANIFEST_FILE, + ]); + + const manifest = JSON.parse( + await readFile(join(destDir, MANIFEST_FILE), "utf8") + ) as { code: string; label: string }[]; + expect(manifest).toEqual([ + { code: "BLANK", label: "(Tyhjä)" }, + { code: "S11", label: "Yritykset" }, + { code: "S131311", label: "Kunnat" }, + { + code: "S15", + label: "Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt", + }, + { code: "PUUTTUU", label: "Sektoriluokitus puuttuu" }, + ]); + }); + + it("aborts when discovery returns fewer than MIN_EXPECTED_SECTORS sectors", async () => { + playwrightMock = buildPlaywrightMock({ + fixture, + discoveredSectorTexts: ["S15 Lonely"], + }); + const { XlsxDownloader } = await import("./xlsx-downloader"); + + await expect( + new XlsxDownloader({ + logger: silentLogger, + sourceUrl: "https://example.invalid/source", + }).download(destDir) + ).rejects.toThrow(/Discovered only 1 sektoriluokitus/); + + const files = await readdir(destDir); + expect(files).not.toContain(MANIFEST_FILE); + expect(files).not.toContain("S15.xlsx"); 
}); it("closes the browser when navigation throws", async () => { - playwrightMock = buildPlaywrightMock({ fixture, failTabClick: true }); + playwrightMock = buildPlaywrightMock({ + fixture, + failTabClick: true, + discoveredSectorTexts: ["S15 Whatever"], + }); const { XlsxDownloader } = await import("./xlsx-downloader"); - const dest = join(workDir, "paatokset.xlsx"); await expect( new XlsxDownloader({ logger: silentLogger, sourceUrl: "https://example.invalid/source", - }).download(dest) + }).download(destDir) ).rejects.toThrow(/tab click failure/); expect(closeMock).toHaveBeenCalledTimes(1); }); - it("preserves the existing destination file when validation fails", async () => { - playwrightMock = buildPlaywrightMock({ fixture, produceInvalidXlsx: true }); + it("skips sectors whose xlsx already exists and parses", async () => { + playwrightMock = buildPlaywrightMock({ + fixture, + discoveredSectorTexts: [ + "S11 Yritykset", + "S13 Julkisyhteisöt", + "S15 NPISH", + ], + }); const { XlsxDownloader } = await import("./xlsx-downloader"); - const dest = join(workDir, "paatokset.xlsx"); - // Pre-populate destination with the known-good fixture bytes. - await writeFile(dest, fixture); - const goodBytes = await readFile(dest); + // Pre-populate S11.xlsx with the valid fixture — the downloader should + // see it as cached and only fetch the remaining sectors. 
+ const { mkdir } = await import("node:fs/promises"); + await mkdir(destDir, { recursive: true }); + await writeFile(join(destDir, "S11.xlsx"), fixture); + + await new XlsxDownloader({ + logger: silentLogger, + sourceUrl: "https://example.invalid/source", + }).download(destDir); + + const files = (await readdir(destDir)).sort(); + expect(files).toEqual(["S11.xlsx", "S13.xlsx", "S15.xlsx", MANIFEST_FILE]); + }); + + it("on validation failure: throws, manifest is not written, no temp leftovers", async () => { + playwrightMock = buildPlaywrightMock({ + fixture, + produceInvalidXlsx: true, + discoveredSectorTexts: [ + "S11 Yritykset", + "S13 Julkisyhteisöt", + "S15 NPISH", + ], + }); + const { XlsxDownloader } = await import("./xlsx-downloader"); await expect( new XlsxDownloader({ logger: silentLogger, sourceUrl: "https://example.invalid/source", - }).download(dest) + }).download(destDir) ).rejects.toThrow(); - const afterBytes = await readFile(dest); - expect(afterBytes.equals(goodBytes)).toBe(true); - - const siblings = await readdir(dirname(dest)); - expect(siblings.some((f) => f.includes(".tmp-"))).toBe(false); + const files = await readdir(destDir); + // No manifest because the run aborted before that step. + expect(files).not.toContain(MANIFEST_FILE); + // No leftover *.tmp-* siblings (cleanupTemp ran on failure). 
+ expect(files.some((f) => f.includes(".tmp-"))).toBe(false); }); }); diff --git a/src/cli/grants-explorer/clients/xlsx-downloader.ts b/src/cli/grants-explorer/clients/xlsx-downloader.ts index 2777c4c..1531447 100644 --- a/src/cli/grants-explorer/clients/xlsx-downloader.ts +++ b/src/cli/grants-explorer/clients/xlsx-downloader.ts @@ -1,10 +1,20 @@ import { randomUUID } from "node:crypto"; +import { existsSync } from "node:fs"; import { mkdir, rename, unlink } from "node:fs/promises"; -import { dirname } from "node:path"; +import { join } from "node:path"; import type { Logger } from "~clients/logger"; import type { Browser, Frame, Page } from "playwright"; import { chromium } from "playwright"; +import type { Sector } from "../types/schemas"; +import { + BLANK_CODE, + BLANK_LABEL, + MISSING_CODE, + MISSING_LABEL, + parseSectorOption, +} from "../utils/sector-parser"; +import { writeManifest } from "./manifest"; import { XlsxLoader } from "./xlsx-loader"; // Selectors locked in from manual discovery on 2026-05-24 against @@ -14,18 +24,14 @@ import { XlsxLoader } from "./xlsx-loader"; // desktop variant (slicers + per-visual overflow menu). Power BI does not // expose tab/menuitem ARIA roles, so navigation uses text + class selectors. // -// Filter scope: this downloader reproduces the existing tmp/paatokset.xlsx — -// the "Avustusasiat" tab with the Sektoriluokitus slicer set to -// "S15 Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt". -// Other filter scopes are out of scope (see plan / README). +// Scope: this downloader iterates every Sektoriluokitus option exposed by the +// "Avustusasiat" slicer and downloads one xlsx per sector into the destination +// directory. Each file is written via a temp-name + rename so partial state is +// never visible at the final path. The sectors manifest is written last. 
const TAB_NAME = "Avustusasiat"; +const SLICER_ARIA_LABEL = "Sektoriluokituskoodi- ja nimi"; const SLICER_LABEL = "Sektoriluokitus"; -// S15 = "Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt" -// (Non-profit institutions serving households). The slicer's search input is -// "Hae"; typing the prefix "S15" filters the virtualized list to a single -// match, which we then click. -const SECTOR_OPTION_PREFIX = "S15"; const POWERBI_IFRAME_SELECTOR = 'iframe[src*="powerbi"]'; const PBI_OVERFLOW_BTN_SELECTOR = ".vcMenuBtn"; @@ -43,22 +49,43 @@ const REPORT_READY_TIMEOUT_MS = 90_000; const DOWNLOAD_TIMEOUT_MS = 120_000; const DEBUG_DIR = "tmp/grants-explorer/debug"; +// Discovery: walk the slicer listbox via keyboard navigation (ArrowDown), reading +// the focused option's text each step. Power BI auto-scrolls the focused option +// into view, which is more robust than guessing a scroll-container CSS class. +// MAX_STEPS is the safety cap (well above any realistic sektoriluokitus count); +// STABLE_THRESHOLD is the number of consecutive ArrowDown presses that produce +// the same focused text before we declare "end of list". +const KEYBOARD_NAV_MAX_STEPS = 200; +// Tolerance for transient repeats: Power BI sometimes lags the activeElement +// update by more than one keyboard step, so 3 was tripping the early-exit on +// healthy lists and dropping rows mid-scroll. 10 gives generous headroom while +// MAX_STEPS still caps total runtime. +const KEYBOARD_NAV_STABLE_THRESHOLD = 10; +// Wait between ArrowDown and the next read. Empirically 80 ms was too tight +// (focus hadn't updated yet, so the next read saw the same row); 150 ms +// settles reliably without making a full 200-step walk feel slow. +const KEYBOARD_NAV_STEP_MS = 150; +// Sanity guard: the source dataset exposes the full institutional-sector +// classification (S11..S15 plus sub-codes). 
Anything below this is almost +// certainly a discovery regression — surface loudly rather than ship a partial +// manifest. +const MIN_EXPECTED_SECTORS = 3; + export type XlsxDownloaderOptions = { logger: Logger; sourceUrl: string; }; /** - * Downloads the Avustusasiat (S15) grants xlsx from tutkihallintoa.fi via - * Power BI's per-visual Export menu, atomically replacing the destination - * only if the downloaded file parses successfully through XlsxLoader. + * Downloads one xlsx per Sektoriluokitus from tutkihallintoa.fi. * * Contract: - * - Never leaves the destination in a partial state. Writes to a sibling - * temp file, validates by parsing, then renames over destination. - * - On any failure, removes the temp file and leaves destination untouched. - * - Always closes the browser (finally), so a hung selector doesn't leak - * a Chromium process. + * - Per-sector atomicity: each `<code>.xlsx` is written via a sibling temp + * file and only renamed into place after the xlsx parses successfully. + * - Resume-friendly: sectors whose final file already exists and parses + * are skipped, so re-running after a mid-loop failure only fetches the + * missing ones. The manifest is written only after every sector lands. + * - On any sector failure, throws; the browser is always closed in finally.
*/ export class XlsxDownloader { private logger: Logger; @@ -69,13 +96,12 @@ export class XlsxDownloader { this.sourceUrl = sourceUrl; } - async download(destinationPath: string): Promise { - await mkdir(dirname(destinationPath), { recursive: true }); - const tempPath = `${destinationPath}.tmp-${Date.now()}-${randomUUID().slice(0, 8)}`; + async download(destDir: string): Promise { + await mkdir(destDir, { recursive: true }); - this.logger.info("Downloading paatokset.xlsx", { + this.logger.info("Starting multi-sector grants download", { sourceUrl: this.sourceUrl, - tempPath, + destDir, }); let browser: Browser | null = null; @@ -88,9 +114,8 @@ export class XlsxDownloader { locale: "fi-FI", // 1100px viewport: Power BI's hover-to-reveal chrome only triggers // when the visual being hovered fits within the viewport scroll - // window. A taller viewport breaks the hover detection. The - // export-options dialog button sits outside the viewport in this - // mode, so we confirm via keyboard Enter (the button is autofocus). + // window. The export-options dialog button sits outside the viewport + // in this mode, so we confirm via DOM .click(). viewport: { width: 1600, height: 1100 }, }); page = await context.newPage(); @@ -117,26 +142,92 @@ export class XlsxDownloader { step = "tab-click"; await this.clickAvustusasiatTab(report); - step = "slicer-filter"; - await this.applySectorFilter(report, page); - - step = "open-export"; - const download = await this.openExportAndDownload(report, page); + // Best-effort: read the report's headline "positive decisions" count so + // we can reconcile it against the sum of per-sector rows after download. 
+ const headlineTotal = await this.readDecisionHeadline(report); - step = "save"; - await download.saveAs(tempPath); - this.logger.info("Saved download to temp path", { tempPath }); + step = "discover-sectors"; + const sectors = await this.discoverSectors(report, page); + this.logger.info("Discovered sektoriluokitus options", { + count: sectors.length, + codes: sectors.map((s) => s.code), + }); + if (sectors.length < MIN_EXPECTED_SECTORS) { + await this.debugSnapshot(page, "discover-sectors-undersize"); + throw new Error( + `Discovered only ${sectors.length} sektoriluokitus option(s); expected at least ${MIN_EXPECTED_SECTORS}. ` + + `See ${DEBUG_DIR}/discover-sectors-undersize-*.png for the live DOM.` + ); + } - step = "validate"; - const rows = new XlsxLoader({ logger: this.logger }).load(tempPath); - if (rows.length === 0) { - throw new Error("Downloaded xlsx parsed to 0 rows"); + let loadedRows = 0; + const obtained: Sector[] = []; + for (const sector of sectors) { + const destPath = join(destDir, `${sector.code}.xlsx`); + const cachedRows = this.cachedRowCount(destPath, sector); + if (cachedRows !== null) { + this.logger.info("Sector xlsx already present; skipping", { + code: sector.code, + destPath, + rows: cachedRows, + }); + loadedRows += cachedRows; + obtained.push(sector); + continue; + } + step = `sector-${sector.code}`; + try { + loadedRows += await this.downloadOneSector( + report, + page, + sector, + destDir + ); + obtained.push(sector); + } catch (error) { + // The two non-sector buckets (BLANK/PUUTTUU) are selected via a more + // fragile path than real S-codes. If one fails, skip it rather than + // discarding the whole run — the S-codes are the bulk, and the + // reconciliation delta will reflect the missing bucket. + if (sector.code === BLANK_CODE || sector.code === MISSING_CODE) { + this.logger.warn("Skipping non-sector bucket after failure", { + code: sector.code, + error: error instanceof Error ? 
error.message : String(error), + }); + continue; + } + throw error; + } } - this.logger.info("Validated download", { tempPath, rows: rows.length }); - step = "rename"; - await rename(tempPath, destinationPath); - this.logger.info("Refetched paatokset.xlsx", { destinationPath }); + step = "write-manifest"; + await writeManifest(destDir, obtained); + this.logger.info("Wrote sectors manifest", { + destDir, + sectors: obtained.length, + }); + + // Reconcile: the per-sector rows are a partition of all positive + // decisions, so their sum should match the report headline. A shortfall + // means we missed a sector (or a bucket); we warn rather than throw so a + // long download isn't discarded — the manifest is already written and + // the gap is visible for follow-up. + if (headlineTotal !== null) { + const delta = headlineTotal - loadedRows; + const log = + Math.abs(delta) > Math.max(50, headlineTotal * 0.01) + ? this.logger.warn.bind(this.logger) + : this.logger.info.bind(this.logger); + log("Reconciled sector rows against report headline", { + headlineTotal, + loadedRows, + delta, + }); + } else { + this.logger.info("Loaded sector rows (headline unavailable)", { + loadedRows, + }); + } } catch (error) { this.logger.error("XlsxDownloader failed", { step, @@ -145,7 +236,6 @@ export class XlsxDownloader { if (page) { await this.debugSnapshot(page, step); } - await this.cleanupTemp(tempPath); throw error; } finally { if (browser) { @@ -154,6 +244,93 @@ export class XlsxDownloader { } } + /** + * Row count of an already-present, parseable sector xlsx, or null when the + * file is missing / empty / unparseable (i.e. it must be (re)downloaded). + */ + private cachedRowCount(destPath: string, sector: Sector): number | null { + if (!existsSync(destPath)) { + return null; + } + try { + const rows = new XlsxLoader({ logger: this.logger }).load(destPath, { + sector, + }); + return rows.length > 0 ? 
rows.length : null; + } catch (error) { + this.logger.warn("Cached sector xlsx failed to parse; will re-download", { + destPath, + error: error instanceof Error ? error.message : String(error), + }); + return null; + } + } + + /** + * Best-effort scrape of the "Myönteiset/Myönnetyt avustuspäätökset" headline + * count (Finnish thousands separators are spaces / nbsp). Returns null if the + * card text isn't found — reconciliation is advisory, never fatal. + */ + private async readDecisionHeadline(report: Frame): Promise { + try { + const bodyText = await report.evaluate(() => document.body.innerText); + const match = /Myön\w+\s+avustuspäätökset\s*([\d\u00a0 ]+)/.exec( + bodyText + ); + const digits = match?.[1]?.replace(/\D/g, ""); + if (!digits) { + return null; + } + const n = Number(digits); + return Number.isFinite(n) && n > 0 ? n : null; + } catch { + return null; + } + } + + private async downloadOneSector( + report: Frame, + page: Page, + sector: Sector, + destDir: string + ): Promise { + const destPath = join(destDir, `${sector.code}.xlsx`); + const tempPath = `${destPath}.tmp-${Date.now()}-${randomUUID().slice(0, 8)}`; + + this.logger.info("Downloading sector xlsx", { + code: sector.code, + tempPath, + }); + + try { + await this.applySectorFilter(report, page, sector.code); + const download = await this.openExportAndDownload(report, page); + await download.saveAs(tempPath); + this.logger.debug("Saved sector download to temp path", { + code: sector.code, + tempPath, + }); + + const rows = new XlsxLoader({ logger: this.logger }).load(tempPath, { + sector, + }); + if (rows.length === 0) { + throw new Error(`Sector ${sector.code} parsed to 0 rows`); + } + + await rename(tempPath, destPath); + this.logger.info("Saved sector xlsx", { + code: sector.code, + destPath, + rows: rows.length, + }); + return rows.length; + } catch (error) { + await this.cleanupTemp(tempPath); + throw error; + } + } + private async waitForReportFrame(page: Page): Promise { const start = 
Date.now(); while (Date.now() - start < REPORT_READY_TIMEOUT_MS) { @@ -185,74 +362,231 @@ export class XlsxDownloader { const tab = report.getByText(TAB_NAME, { exact: true }).first(); await tab.waitFor({ state: "visible", timeout: 15_000 }); await tab.click({ timeout: 10_000 }); - // Wait for the page-switch to settle. await report.waitForTimeout(3000); this.logger.debug("Clicked report tab", { tab: TAB_NAME }); } - private async applySectorFilter(report: Frame, page: Page): Promise { - // The Sektoriluokitus slicer is a dropdown with aria-label - // "Sektoriluokituskoodi- ja nimi" (the underlying column name). It - // opens a virtualized list (~8 visible at a time) with a search input - // labeled "Hae" — typing into that input filters the list to matching - // items, which we then click. + /** + * Open the Sektoriluokitus slicer and walk every option via keyboard + * ArrowDown, reading the focused element's text on each step. Power BI's + * listbox auto-scrolls the focused row into view, which avoids relying on + * a fragile scroll-container CSS class. + */ + private async discoverSectors(report: Frame, page: Page): Promise { const dropdown = report - .locator('[aria-label="Sektoriluokituskoodi- ja nimi"]') + .locator(`[aria-label="${SLICER_ARIA_LABEL}"]`) .first(); await dropdown.waitFor({ state: "visible", timeout: 15_000 }); await dropdown.click({ timeout: 10_000 }); - this.logger.debug("Opened slicer dropdown", { slicer: SLICER_LABEL }); + this.logger.debug("Opened slicer dropdown for discovery", { + slicer: SLICER_LABEL, + }); await page.waitForTimeout(1500); - // Find the search input by enumerating all "Hae" inputs and picking the - // visible one (only one is visible at a time — the one inside the - // just-opened dropdown). 
- const searchHandle = await report.evaluateHandle(() => { - const inputs = [ - ...document.querySelectorAll( - 'input[aria-label="Hae"], input[placeholder="Hae"]' - ), - ]; - return ( - inputs.find((el) => { - const r = el.getBoundingClientRect(); - return r.width > 0 && r.height > 0; - }) ?? null - ); - }); - const searchEl = searchHandle.asElement(); - if (!searchEl) { - throw new Error( - "No visible 'Hae' search input found after opening slicer dropdown" - ); + const readFocused = (): Promise => + report.evaluate(() => { + const el = document.activeElement as HTMLElement | null; + if (!el) { + return ""; + } + // innerText is always a string but can be empty (e.g. icon-only rows); + // fall back to aria-label so checkbox-style options still report a name. + const text = el.innerText.trim(); + if (text) { + return text; + } + return (el.getAttribute("aria-label") ?? "").trim(); + }); + + // Seed: ArrowDown from the search input moves focus into the listbox onto + // its first option. Generous initial settle (focus crosses widget boundary). + await page.keyboard.press("ArrowDown"); + await page.waitForTimeout(KEYBOARD_NAV_STEP_MS * 2); + + // Dedupe by sector code. Stability is measured on the *focused text* (not + // on map growth): when ArrowDown stops moving focus we hit the bottom row + // ("Sektoriluokitus puuttuu") and keep reading the same text — that's the + // end-of-list signal. + const byCode = new Map(); + let stable = 0; + let lastFocused = ""; + + const ingest = (focused: string): void => { + const sector = parseSectorOption(focused); + if (sector && !byCode.has(sector.code)) { + byCode.set(sector.code, sector); + } + stable = focused !== "" && focused === lastFocused ? stable + 1 : 0; + lastFocused = focused; + }; + + // Capture the seeded position before we start pressing further — otherwise + // the first ArrowDown inside the loop would advance past option #1. 
+ ingest(await readFocused()); + + for ( + let i = 0; + i < KEYBOARD_NAV_MAX_STEPS && stable < KEYBOARD_NAV_STABLE_THRESHOLD; + i++ + ) { + await page.keyboard.press("ArrowDown"); + await page.waitForTimeout(KEYBOARD_NAV_STEP_MS); + ingest(await readFocused()); } - // Click to focus, then type. fill() can race with Power BI's own - // focus management; click+type is more robust. - // Click to focus, then keyboard.type to enter text. Locator.fill() can - // race with Power BI's own focus management; an explicit focus+type - // through page.keyboard is more reliable. - await searchEl.click(); - await page.keyboard.type(SECTOR_OPTION_PREFIX, { delay: 30 }); - this.logger.debug("Typed slicer search query", { - query: SECTOR_OPTION_PREFIX, - }); - await page.waitForTimeout(1500); - // After the search filter applies, S15 should be the (only) remaining - // option. Power BI renders options as rows in a listbox; the visible - // text node matching the prefix is the click target. - const option = report - .getByText(new RegExp(`^${SECTOR_OPTION_PREFIX}\\s`)) + await page.keyboard.press("Escape"); + await page.waitForTimeout(500); + + return Array.from(byCode.values()); + } + + /** + * Apply the sektoriluokitus filter for a single code. Power BI's slicer + * search input persists its value across open/close cycles, so we always + * clear it before navigating — otherwise a stale "S15" query would still + * filter the listbox when BLANK's keyboard walk runs. + * + * Three option shapes are handled: + * - real S-codes: type the code, click the row whose text starts with it + * (fast — the search narrows the list to a single row); + * - BLANK/PUUTTUU: walk the listbox via keyboard ArrowDown until the + * focused row's text matches the target label, then click the focused + * element. `getByText` proved fragile here: substring matching against + * "(Tyhjä)" finds zero elements, since the row's whole-element text + * doesn't normalize to the bare label. 
+ * + * The whole selection block runs in try/finally: Escape always fires, even + * if selection throws. Otherwise a failed selection leaves the dropdown + * open and the next sector's dropdown.click() toggles it closed — making + * its 'Hae' visibility check fail and aborting the whole run. + */ + private async applySectorFilter( + report: Frame, + page: Page, + code: string + ): Promise { + const dropdown = report + .locator(`[aria-label="${SLICER_ARIA_LABEL}"]`) .first(); - await option.waitFor({ state: "visible", timeout: 10_000 }); - await option.click({ timeout: 10_000 }); - this.logger.debug("Selected sector option", { - sector: SECTOR_OPTION_PREFIX, - }); + await dropdown.waitFor({ state: "visible", timeout: 15_000 }); + await dropdown.click({ timeout: 10_000 }); + this.logger.debug("Opened slicer dropdown", { code }); + await page.waitForTimeout(1500); - // Close the dropdown so it doesn't obscure the per-visual overflow menu. - await page.keyboard.press("Escape"); - await page.waitForTimeout(2000); + try { + const searchHandle = await report.evaluateHandle(() => { + const inputs = [ + ...document.querySelectorAll( + 'input[aria-label="Hae"], input[placeholder="Hae"]' + ), + ]; + return ( + inputs.find((el) => { + const r = el.getBoundingClientRect(); + return r.width > 0 && r.height > 0; + }) ?? null + ); + }); + const searchEl = searchHandle.asElement(); + if (!searchEl) { + throw new Error( + "No visible 'Hae' search input found after opening slicer dropdown" + ); + } + + await searchEl.click(); + // Clear any prior search query so a stale filter from the previous + // sector doesn't shrink the listbox under our keyboard walk. 
+ await page.keyboard.press("ControlOrMeta+A"); + await page.keyboard.press("Backspace"); + await page.waitForTimeout(200); + + if (code === BLANK_CODE) { + await this.selectOptionByKeyboard(report, page, BLANK_LABEL); + } else if (code === MISSING_CODE) { + await this.selectOptionByKeyboard(report, page, MISSING_LABEL); + } else { + await page.keyboard.type(code, { delay: 30 }); + await page.waitForTimeout(1500); + // Anchor on " " (trailing space) so e.g. "S1313" doesn't also + // match "S131311" — the prefix collision the deeper sub-codes create. + const option = report.getByText(new RegExp(`^${code}\\s`)).first(); + await option.waitFor({ state: "visible", timeout: 10_000 }); + await option.click({ timeout: 10_000 }); + } + this.logger.debug("Selected sector option", { code }); + } finally { + // Always close the dropdown — even on failure — so the next sector + // opens a clean slicer rather than toggling this one shut. + await page.keyboard.press("Escape").catch(() => { + // intentional: don't mask the original selection error + }); + await page.waitForTimeout(2000); + } + } + + /** + * Walk the (already-open) slicer listbox with ArrowDown until the focused + * row's `innerText` exactly matches `targetLabel`, then click the focused + * element. Reuses the same focus-stability mechanism as `discoverSectors`: + * after KEYBOARD_NAV_STABLE_THRESHOLD consecutive identical reads we + * conclude the target isn't in the list and throw. + */ + private async selectOptionByKeyboard( + report: Frame, + page: Page, + targetLabel: string + ): Promise { + // Seed focus into the listbox (ArrowDown from the search input crosses + // the widget boundary onto the first option). 
+ await page.keyboard.press("ArrowDown"); + await page.waitForTimeout(KEYBOARD_NAV_STEP_MS * 2); + + const readFocused = (): Promise => + report.evaluate(() => { + const el = document.activeElement as HTMLElement | null; + if (!el) { + return ""; + } + const text = el.innerText.trim(); + if (text) { + return text; + } + return (el.getAttribute("aria-label") ?? "").trim(); + }); + + let lastFocused = ""; + let stable = 0; + + for ( + let i = 0; + i < KEYBOARD_NAV_MAX_STEPS && stable < KEYBOARD_NAV_STABLE_THRESHOLD; + i++ + ) { + const focused = await readFocused(); + if (focused === targetLabel) { + const handle = await report.evaluateHandle( + () => document.activeElement + ); + const element = handle.asElement(); + if (!element) { + throw new Error( + `Focused element vanished while selecting "${targetLabel}"` + ); + } + await element.click({ timeout: 10_000 }); + await page.waitForTimeout(500); + return; + } + stable = focused !== "" && focused === lastFocused ? stable + 1 : 0; + lastFocused = focused; + await page.keyboard.press("ArrowDown"); + await page.waitForTimeout(KEYBOARD_NAV_STEP_MS); + } + + throw new Error( + `Option "${targetLabel}" not found via keyboard navigation` + ); } private async openExportAndDownload(report: Frame, page: Page) { @@ -277,14 +611,9 @@ export class XlsxDownloader { ); } - // Scroll the positive-decisions visual into view so the table's - // overflow button is positioned at a known location inside the iframe. await tableEl.scrollIntoViewIfNeeded(); await page.waitForTimeout(1000); - // Hover the centre of the visual to trigger Power BI's overflow - // chrome. Empirically, centre-hover reveals the .vcMenuBtn; top-right - // hover does not (the buttons aren't anchored to the title bar). 
const box = await tableEl.boundingBox(); if (!box) { throw new Error("Positive-decisions table has no bounding box"); @@ -300,10 +629,6 @@ export class XlsxDownloader { ); await page.waitForTimeout(1500); - // .vcMenuBtn is a sibling of .visualContainer (not a child) — Power BI - // renders visual chrome in a separate overlay layer. Since only the - // currently-hovered visual has its button visible, an unscoped lookup - // for the *visible* .vcMenuBtn safely targets the Myönteiset table. const overflow = report .locator( `${PBI_OVERFLOW_BTN_SELECTOR}, [aria-label="${PBI_OVERFLOW_BTN_ARIA}"]` @@ -316,40 +641,23 @@ export class XlsxDownloader { }); await page.waitForTimeout(1000); - // Click "Vie tiedot" — this opens an export-options dialog (Mitkä - // tiedot haluat viedä?) with format pre-selected to .xlsx and a "Vie" - // primary button. The download fires when we click the dialog's Vie. const exportMenu = report.getByText(/^(Vie tiedot|Export data)/i).first(); await exportMenu.waitFor({ state: "visible", timeout: 8_000 }); await exportMenu.click({ timeout: 10_000 }); this.logger.debug('Clicked "Vie tiedot" menu item'); - // Wait for the export-options dialog by its heading. const dialog = report .locator('[role="dialog"]') .filter({ hasText: /Mitkä tiedot haluat viedä|What data do you want/i }) .first(); await dialog.waitFor({ state: "visible", timeout: 10_000 }); - // Default format is already .xlsx (max 150 000 rows) — no need to - // change it. The primary action button is "Vie" with class - // pbi-modern-button.primaryBtn.exportButton (aria-label="Vie"). const downloadPromise = page.waitForEvent("download", { timeout: DOWNLOAD_TIMEOUT_MS, }); - // If something below throws, downloadPromise stays unawaited and Node - // would log an unhandled-rejection when the browser closes in finally. - // Attach a no-op catch so the error surfaces via the thrown click error, - // not as a separate noisy rejection. 
downloadPromise.catch(() => { // intentional: surface the real error from the click path below }); - // Invoke the dialog's "Vie" button's DOM .click() directly. Playwright's - // user-mode click refuses to fire because the iframe is taller than the - // viewport — the button's page-y is outside Playwright's actionability - // window, even with force=true. The button has data-testid="export-btn". - // A JS-level .click() bypasses positioning entirely and triggers the - // same handler the user click would. const exportConfirm = dialog .locator( 'button[data-testid="export-btn"], button[aria-label="Vie"], button.exportButton.primaryBtn' diff --git a/src/cli/grants-explorer/clients/xlsx-loader.test.ts b/src/cli/grants-explorer/clients/xlsx-loader.test.ts index f5bbe3a..ce4e224 100644 --- a/src/cli/grants-explorer/clients/xlsx-loader.test.ts +++ b/src/cli/grants-explorer/clients/xlsx-loader.test.ts @@ -160,6 +160,11 @@ describe("normalizeText", () => { // to a temp file, and runs the full load() pipeline. This protects against // regressions where extractBusinessId is wired up wrong and recipient_business_id // ends up always-null while the parser unit test still passes. 
+const TEST_SECTOR = { + code: "S15", + label: "Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt", +}; + describe("XlsxLoader.load() — recipient_business_id wiring", () => { let workDir: string; @@ -177,7 +182,9 @@ describe("XlsxLoader.load() — recipient_business_id wiring", () => { "Anonymous private grantee", ]); - const rows = new XlsxLoader({ logger: silentLogger }).load(filePath); + const rows = new XlsxLoader({ logger: silentLogger }).load(filePath, { + sector: TEST_SECTOR, + }); expect(rows).toHaveLength(2); expect(rows[0]?.recipient).toBe("Lapin Martat ry (0210606-0)"); @@ -186,4 +193,22 @@ describe("XlsxLoader.load() — recipient_business_id wiring", () => { expect(rows[1]?.recipient).toBe("Anonymous private grantee"); expect(rows[1]?.recipient_business_id).toBeNull(); }); + + it("tags every loaded row with the sektoriluokitus code and label", async () => { + const filePath = await writeFixtureXlsx(workDir, [ + "Lapin Martat ry (0210606-0)", + "Anonymous private grantee", + ]); + + const rows = new XlsxLoader({ logger: silentLogger }).load(filePath, { + sector: TEST_SECTOR, + }); + + expect(rows.every((r) => r.sektoriluokitus_code === TEST_SECTOR.code)).toBe( + true + ); + expect( + rows.every((r) => r.sektoriluokitus_label === TEST_SECTOR.label) + ).toBe(true); + }); }); diff --git a/src/cli/grants-explorer/clients/xlsx-loader.ts b/src/cli/grants-explorer/clients/xlsx-loader.ts index 5769838..a05aef9 100644 --- a/src/cli/grants-explorer/clients/xlsx-loader.ts +++ b/src/cli/grants-explorer/clients/xlsx-loader.ts @@ -1,7 +1,7 @@ import type { Logger } from "~clients/logger"; import XLSX from "xlsx"; -import type { GrantRow } from "../types/schemas"; +import type { GrantRow, Sector } from "../types/schemas"; import { GrantRowSchema } from "../types/schemas"; import { extractBusinessId } from "../utils/business-id"; @@ -131,6 +131,13 @@ export type XlsxLoaderOptions = { * Header mapping is by exact Finnish column name (see HEADER_TO_FIELD); column 
* order in the workbook does not matter. Unknown headers are ignored. */ +export type LoadOptions = { + // Sector tag attached to every row of the loaded xlsx. Required: each + // per-sector xlsx must be loaded with the matching manifest entry so the + // sektoriluokitus_code / _label columns in the DB carry traceable values. + sector: Sector; +}; + export class XlsxLoader { private logger: Logger; @@ -138,7 +145,7 @@ export class XlsxLoader { this.logger = logger; } - load(filePath: string): GrantRow[] { + load(filePath: string, options: LoadOptions): GrantRow[] { this.logger.debug("Reading xlsx file", { filePath }); const workbook = XLSX.readFile(filePath, { cellDates: true }); const sheetName = workbook.SheetNames[0]; @@ -219,6 +226,8 @@ export class XlsxLoader { purpose: normalizeText(at("purpose")), programme: normalizeText(at("programme")), region: normalizeText(at("region")), + sektoriluokitus_code: options.sector.code, + sektoriluokitus_label: options.sector.label, }; const parsed = GrantRowSchema.safeParse(candidate); @@ -254,6 +263,7 @@ export class XlsxLoader { this.logger.info("Loaded xlsx rows", { filePath, + sektoriluokitusCode: options.sector.code, rowCount: rows.length, dateNormalizationFailures, validationFailures, diff --git a/src/cli/grants-explorer/constants.ts b/src/cli/grants-explorer/constants.ts index 6ae67a9..6a25552 100644 --- a/src/cli/grants-explorer/constants.ts +++ b/src/cli/grants-explorer/constants.ts @@ -1,5 +1,10 @@ export const AGENT_NAME = "GrantsExplorerAgent"; export const AGENT_MODEL = "gpt-5-mini"; -export const DEFAULT_XLSX_PATH = "tmp/paatokset.xlsx"; +export const DEFAULT_PAATOKSET_DIR = "tmp/grants-explorer/paatokset"; +export const MANIFEST_FILE = "sectors.json"; +// Combined-dataset artifact: a single JSON array of every GrantRow, written +// to the parent of DEFAULT_PAATOKSET_DIR after every successful load. The +// per-sector xlsx files stay alongside for resume-after-failure semantics. 
+export const DEFAULT_COMBINED_GRANTS_FILE = "grants.json"; export const PAATOKSET_SOURCE_URL = "https://www.tutkihallintoa.fi/valtionavustukset/tutkiavustuksia/"; diff --git a/src/cli/grants-explorer/main.ts b/src/cli/grants-explorer/main.ts index 4bd2d59..33a63e2 100644 --- a/src/cli/grants-explorer/main.ts +++ b/src/cli/grants-explorer/main.ts @@ -1,22 +1,27 @@ // pnpm run:grants-explorer -// pnpm run:grants-explorer --file=tmp/paatokset.xlsx +// pnpm run:grants-explorer --dir=tmp/grants-explorer/paatokset // pnpm run:grants-explorer --refetch import "dotenv/config"; import { existsSync } from "node:fs"; +import { dirname, join } from "node:path"; import { AgentRunner } from "~clients/agent-runner"; import { Logger } from "~clients/logger"; import { parseArgs } from "~utils/parse-args"; import { QuestionHandler } from "~utils/question-handler"; +import { writeCombinedGrants } from "./clients/combined-writer"; import { GrantsDatabase } from "./clients/database"; +import { readManifest } from "./clients/manifest"; import { XlsxDownloader } from "./clients/xlsx-downloader"; import { XlsxLoader } from "./clients/xlsx-loader"; import { AGENT_MODEL, AGENT_NAME, - DEFAULT_XLSX_PATH, + DEFAULT_COMBINED_GRANTS_FILE, + DEFAULT_PAATOKSET_DIR, + MANIFEST_FILE, PAATOKSET_SOURCE_URL, } from "./constants"; import { shouldRefetch } from "./should-refetch"; @@ -26,32 +31,65 @@ import { GrantsAgentOutputSchema, GrantsAgentOutputTypeSchema, } from "./types/schemas"; +import type { GrantRow } from "./types/schemas"; const logger = new Logger(); let db: GrantsDatabase | null = null; try { - const { file, refetch } = parseArgs({ logger, schema: CliArgsSchema }); - const xlsxPath = file ?? DEFAULT_XLSX_PATH; - - const exists = existsSync(xlsxPath); - if (shouldRefetch({ refetch, exists })) { - if (!exists) { - logger.info("Local xlsx missing; downloading", { xlsxPath }); + const { dir, refetch } = parseArgs({ logger, schema: CliArgsSchema }); + const destDir = dir ?? 
DEFAULT_PAATOKSET_DIR; + + // The manifest is the single canonical proof of a complete download: even + // if some sector xlsx files exist on disk, an absent manifest means a prior + // run was interrupted before listing every sector. Refetch handles that as + // a resume — only missing sector files are re-fetched, then manifest is + // re-written. + const manifestExists = existsSync(join(destDir, MANIFEST_FILE)); + if (shouldRefetch({ refetch, exists: manifestExists })) { + if (!manifestExists) { + logger.info("Sectors manifest missing; downloading", { destDir }); } await new XlsxDownloader({ logger, sourceUrl: PAATOKSET_SOURCE_URL, - }).download(xlsxPath); + }).download(destDir); } - const rows = new XlsxLoader({ logger }).load(xlsxPath); + const manifest = await readManifest(destDir); + logger.info("Loaded sectors manifest", { + destDir, + sectors: manifest.length, + }); db = new GrantsDatabase(logger); - db.insertRows(rows); + const loader = new XlsxLoader({ logger }); + const allRows: GrantRow[] = []; + for (const sector of manifest) { + const xlsxPath = join(destDir, `${sector.code}.xlsx`); + if (!existsSync(xlsxPath)) { + throw new Error( + `Manifest references ${sector.code} but ${xlsxPath} is missing — re-run with --refetch` + ); + } + const rows = loader.load(xlsxPath, { sector }); + db.insertRows(rows); + allRows.push(...rows); + } logger.info("Grants loaded into in-memory SQL", { rows: db.getTotalCount(), + sectors: manifest.length, + }); + + // Persist a single combined JSON of every row alongside the per-sector + // xlsx cache. Always written after a successful load so the on-disk file + // mirrors what was just loaded — downstream tools (jq/duckdb/pandas) can + // point at one canonical path without re-running the xlsx parse pipeline. 
+ await writeCombinedGrants({ + logger, + path: join(dirname(destDir), DEFAULT_COMBINED_GRANTS_FILE), + rows: allRows, }); const agentRunner = new AgentRunner({ @@ -75,11 +113,14 @@ Schema (English column | source Finnish header): - purpose (Hyväksytty käyttötarkoitus) - programme (Haun nimi (asianumero)) - region (Alueet) region / municipality +- sektoriluokitus_code Finnish institutional sector code, e.g. "S15", "S131311" (S + 1–6 digits). Two sentinels: "BLANK" (no sector) and "PUUTTUU" (source-tagged "sector missing"). Indexed. +- sektoriluokitus_label Human-readable sector name matching the code, e.g. "Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt". Notes: - Amounts and dates can be NULL; SUM/AVG handle that correctly. - For Y-tunnus equality searches use recipient_business_id = '' (preferred — indexed and exact). The full string is also available in recipient for substring/name matching. - To aggregate grants per legal entity, GROUP BY recipient_business_id (and filter out NULL when only registered entities are wanted). +- The dataset spans every Sektoriluokitus available upstream (including coarse codes like S11 and deep ones like S131311). Filter by sektoriluokitus_code to scope to one sector — e.g. WHERE sektoriluokitus_code = 'S15' reproduces the legacy NPISH-only view. Codes 'BLANK'/'PUUTTUU' are the sector-less rows; exclude them with WHERE sektoriluokitus_code LIKE 'S%' if you only want classified sectors. - Answer in the language of the user's question (Finnish or English). - Be concise and grounded in the SQL results; don't invent numbers. 
diff --git a/src/cli/grants-explorer/tools/sql-tool.ts b/src/cli/grants-explorer/tools/sql-tool.ts index 6521de2..64af905 100644 --- a/src/cli/grants-explorer/tools/sql-tool.ts +++ b/src/cli/grants-explorer/tools/sql-tool.ts @@ -46,23 +46,26 @@ export const createSqlQueryTool = (db: GrantsDatabase) => Table: grants Columns (English name | source Finnish header | type): -- id | (auto) | INTEGER PRIMARY KEY -- decision_date | Päätös pvm | TEXT, ISO date 'YYYY-MM-DD', may be NULL -- recipient | Saajan nimi | TEXT, full original name string (e.g. "Lapin Martat ry (0210606-0)") +- id | (auto) | INTEGER PRIMARY KEY +- decision_date | Päätös pvm | TEXT, ISO date 'YYYY-MM-DD', may be NULL +- recipient | Saajan nimi | TEXT, full original name string (e.g. "Lapin Martat ry (0210606-0)") - recipient_business_id | (extracted from Saajan nimi) | TEXT, Y-tunnus only (e.g. "0210606-0"), NULL when the recipient has no business ID (private persons, foreign entities, working groups). Indexed. -- granting_authority | Myöntäjä | TEXT, e.g. "Lapin ELY-keskus" -- case_number | Asianumero | TEXT -- amount_applied | Haettu | INTEGER, EUR, may be NULL -- amount_granted | Myönnetty | INTEGER, EUR, may be NULL -- has_eu_funding | EU-varat | INTEGER (0 or 1), 1 = EU funding present -- purpose | Hyväksytty käyttötarkoitus | TEXT, approved purpose -- programme | Haun nimi (asianumero) | TEXT, funding programme incl. programme key -- region | Alueet | TEXT, region / municipality +- granting_authority | Myöntäjä | TEXT, e.g. "Lapin ELY-keskus" +- case_number | Asianumero | TEXT +- amount_applied | Haettu | INTEGER, EUR, may be NULL +- amount_granted | Myönnetty | INTEGER, EUR, may be NULL +- has_eu_funding | EU-varat | INTEGER (0 or 1), 1 = EU funding present +- purpose | Hyväksytty käyttötarkoitus | TEXT, approved purpose +- programme | Haun nimi (asianumero) | TEXT, funding programme incl. 
programme key +- region | Alueet | TEXT, region / municipality +- sektoriluokitus_code | Sektoriluokitus | TEXT NOT NULL, e.g. "S11", "S15", "S131311" (S + 1–6 digits). Sentinels "BLANK"/"PUUTTUU" mark sector-less rows. Indexed. +- sektoriluokitus_label | Sektoriluokitus | TEXT NOT NULL, human-readable sector name matching the code. Rules: - Only one SELECT statement; no semicolons, no DDL/DML keywords. - Amounts and dates can be NULL; use IS NULL / IS NOT NULL where it matters. - For Y-tunnus searches prefer the indexed equality column: recipient_business_id = ''. LIKE on recipient still works for partial-name matches. +- The DB contains rows from every available sektoriluokitus. To match the historical "non-profits only" scope, filter WHERE sektoriluokitus_code = 'S15'. Codes 'BLANK' and 'PUUTTUU' are sector-less rows; exclude with WHERE sektoriluokitus_code LIKE 'S%' for classified-only analysis. Example queries: - Total granted per authority: @@ -72,7 +75,9 @@ Example queries: - EU-funded vs. 
not: SELECT has_eu_funding, COUNT(*) AS n, SUM(amount_granted) AS sum_eur FROM grants GROUP BY has_eu_funding - Top 10 recipients by total granted (one row per legal entity): - SELECT recipient_business_id, MAX(recipient) AS name, SUM(amount_granted) AS total FROM grants WHERE recipient_business_id IS NOT NULL GROUP BY recipient_business_id ORDER BY total DESC LIMIT 10`, + SELECT recipient_business_id, MAX(recipient) AS name, SUM(amount_granted) AS total FROM grants WHERE recipient_business_id IS NOT NULL GROUP BY recipient_business_id ORDER BY total DESC LIMIT 10 +- Grants per sektoriluokitus: + SELECT sektoriluokitus_code, MAX(sektoriluokitus_label) AS label, COUNT(*) AS n, SUM(amount_granted) AS total FROM grants GROUP BY sektoriluokitus_code ORDER BY total DESC`, parameters: z.object({ sql: z.string().describe("A single SQL SELECT query"), }), diff --git a/src/cli/grants-explorer/types/schemas.test.ts b/src/cli/grants-explorer/types/schemas.test.ts index f725970..b0c8419 100644 --- a/src/cli/grants-explorer/types/schemas.test.ts +++ b/src/cli/grants-explorer/types/schemas.test.ts @@ -18,7 +18,7 @@ describe("CliArgsSchema (grants-explorer)", () => { rawArgs: [], }); expect(args.refetch).toBe(false); - expect(args.file).toBeUndefined(); + expect(args.dir).toBeUndefined(); }); it("enables refetch when --refetch is present (no value)", () => { @@ -30,13 +30,13 @@ describe("CliArgsSchema (grants-explorer)", () => { expect(args.refetch).toBe(true); }); - it("accepts --file= as a path string", () => { + it("accepts --dir= as a directory path string", () => { const args = parseArgs({ logger: silentLogger, schema: CliArgsSchema, - rawArgs: ["--file=tmp/other.xlsx"], + rawArgs: ["--dir=tmp/other-paatokset"], }); - expect(args.file).toBe("tmp/other.xlsx"); + expect(args.dir).toBe("tmp/other-paatokset"); expect(args.refetch).toBe(false); }); diff --git a/src/cli/grants-explorer/types/schemas.ts b/src/cli/grants-explorer/types/schemas.ts index 014c2a0..3b4cee8 100644 --- 
a/src/cli/grants-explorer/types/schemas.ts +++ b/src/cli/grants-explorer/types/schemas.ts @@ -1,7 +1,7 @@ import { z } from "zod"; export const CliArgsSchema = z.object({ - file: z.string().optional(), + dir: z.string().optional(), // Presence-only flag. parseArgv hands us bare `true` for `--refetch` and // `undefined` when absent. Any `--refetch=` form arrives as a string // and is rejected here — preventing the historical `z.coerce.boolean()` @@ -26,6 +26,24 @@ export const GrantsAgentOutputSchema = GrantsAgentOutputTypeSchema; export type GrantsAgentOutput = z.infer; +// Sektoriluokitus = Finnish institutional sector classification (S11–S15, plus +// sub-codes). Each sector's grants are exported as a separate xlsx; the manifest +// (sectors.json) records the code/label pairs the downloader discovered, so the +// loader can tag rows back to their sector at insert time. +// Sektoriluokitus code: either a real S-code (S2..S131311 — 1 to 6 digits) +// or one of the two sentinel buckets for rows with no sector (BLANK = null +// value, PUUTTUU = the source's explicit "Sektoriluokitus puuttuu"). +const SECTOR_CODE_RE = /^(S\d{1,6}|BLANK|PUUTTUU)$/; + +export const SectorSchema = z.object({ + code: z.string().regex(SECTOR_CODE_RE), + label: z.string().min(1), +}); +export type Sector = z.infer; + +export const SectorManifestSchema = z.array(SectorSchema).min(1); +export type SectorManifest = z.infer; + // Single source of truth for one row loaded from the xlsx. Used as a runtime // validation tripwire in XlsxLoader: if the per-cell normalizers ever produce // a value that violates this shape (off-by-one bug, schema drift, etc.), the @@ -49,6 +67,10 @@ export const GrantRowSchema = z.object({ purpose: z.string().nullable(), programme: z.string().nullable(), region: z.string().nullable(), + // Sektoriluokitus tag attached by XlsxLoader from the sector manifest. + // Both fields are required (every row originates from one sector's xlsx). 
+ sektoriluokitus_code: z.string().regex(SECTOR_CODE_RE), + sektoriluokitus_label: z.string().min(1), }); export type GrantRow = z.infer<typeof GrantRowSchema>; diff --git a/src/cli/grants-explorer/utils/sector-parser.test.ts b/src/cli/grants-explorer/utils/sector-parser.test.ts new file mode 100644 index 0000000..4ad62f7 --- /dev/null +++ b/src/cli/grants-explorer/utils/sector-parser.test.ts @@ -0,0 +1,54 @@ +import { describe, expect, it } from "vitest"; + +import { parseSectorOption } from "./sector-parser"; + +describe("parseSectorOption", () => { + it.each([ + [ + "S15 Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt", + { + code: "S15", + label: "Kotitalouksia palvelevat voittoa tavoittelemattomat järjestöt", + }, + ], + ["S2 Ulkomaat", { code: "S2", label: "Ulkomaat" }], + ["S11 Yritykset", { code: "S11", label: "Yritykset" }], + ["S141 Työnantaja", { code: "S141", label: "Työnantaja" }], + [ + "S1242 Muut yhteissijoitusyritykset", + { code: "S1242", label: "Muut yhteissijoitusyritykset" }, + ], + [ + "S13111 Valtion budjettitalous", + { code: "S13111", label: "Valtion budjettitalous" }, + ], + ["S131311 Kunnat", { code: "S131311", label: "Kunnat" }], + // Non-breaking space (U+00A0) between code and label — Power BI + // renders one occasionally; the \s+ in the regex covers it. + ["S14\u00A0Kotitaloudet", { code: "S14", label: "Kotitaloudet" }], + // Surrounding whitespace is trimmed before parsing. + [" S12212 Talletuspankit ", { code: "S12212", label: "Talletuspankit" }], + // Special buckets map to sentinel codes.
+ ["(Tyhjä)", { code: "BLANK", label: "(Tyhjä)" }], + [ + "Sektoriluokitus puuttuu", + { code: "PUUTTUU", label: "Sektoriluokitus puuttuu" }, + ], + ])("parses %j", (input, expected) => { + expect(parseSectorOption(input)).toEqual(expected); + }); + + it.each([ + "", + " ", + "S15", // no label + "S15 ", // label is whitespace only after trim + "X15 Something", // wrong prefix + "s15 lowercase prefix", + "S1234567 too many digits", // 7 digits, beyond S\d{1,6} + "15 Kotitaloudet", // missing S + "Valitse kaikki", // select-all control, not an option + ])("returns null for non-option input %j", (input) => { + expect(parseSectorOption(input)).toBeNull(); + }); +}); diff --git a/src/cli/grants-explorer/utils/sector-parser.ts b/src/cli/grants-explorer/utils/sector-parser.ts new file mode 100644 index 0000000..a94fba8 --- /dev/null +++ b/src/cli/grants-explorer/utils/sector-parser.ts @@ -0,0 +1,40 @@ +import type { Sector } from "../types/schemas"; + +// Slicer options come in three shapes: +// 1. A coded sector: "