Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions internal/cbm/sqlite_writer.c
Original file line number Diff line number Diff line change
Expand Up @@ -738,7 +738,7 @@ static uint8_t *build_node_record(const CBMDumpNode *n, int *out_len) {
}

// Build an edges table record: (id, project, source_id, target_id, type, properties)
// url_path_gen is a VIRTUAL generated column — NOT stored in the record.
// url_path_gen and local_name_gen are VIRTUAL generated columns — NOT stored in the record.
static uint8_t *build_edge_record(const CBMDumpEdge *e, int *out_len) {
RecordBuilder r;
rec_init(&r);
Expand Down Expand Up @@ -993,16 +993,17 @@ static uint8_t *build_index_entry_text_int_text_rowid(const char *t1, int64_t va
return cell;
}

// Build UNIQUE index entry for (text, text) + rowid (e.g., nodes unique(project, qualified_name))
// Build UNIQUE index entry for (int64, int64, text) + rowid (edges unique(source_id, target_id,
// type))
static uint8_t *build_index_entry_unique_2int_text_rowid(int64_t v1, int64_t v2, const char *text,
int64_t rowid, int *out_len) {
// Build UNIQUE index entry for (int64, int64, text, text) + rowid — edges
// unique(source_id, target_id, type, local_name_gen) (#768).
static uint8_t *build_index_entry_unique_2int_2text_rowid(int64_t v1, int64_t v2, const char *text,
const char *text2, int64_t rowid,
int *out_len) {
RecordBuilder r;
rec_init(&r);
rec_add_int(&r, v1);
rec_add_int(&r, v2);
rec_add_text(&r, text);
rec_add_text(&r, text2);
rec_add_int(&r, rowid);
int payload_len = 0;
uint8_t *payload = rec_finalize(&r, &payload_len);
Expand Down Expand Up @@ -1515,7 +1516,7 @@ static int cmp_edge_by_url_path(const void *a, const void *b) {
return cmp_i64(g_sort_edges[ia].id, g_sort_edges[ib].id);
}

// autoindex_edges_1: UNIQUE(source_id, target_id, type) + rowid
// autoindex_edges_1: UNIQUE(source_id, target_id, type, local_name_gen) + rowid (#768)
static int cmp_edge_by_src_tgt_type(const void *a, const void *b) {
int ia = *(const int *)a;
int ib = *(const int *)b;
Expand All @@ -1531,6 +1532,10 @@ static int cmp_edge_by_src_tgt_type(const void *a, const void *b) {
if (c) {
return c;
}
c = strcmp(safe_str(g_sort_edges[ia].local_name), safe_str(g_sort_edges[ib].local_name));
if (c) {
return c;
}
return cmp_i64(g_sort_edges[ia].id, g_sort_edges[ib].id);
}

Expand Down Expand Up @@ -1567,8 +1572,8 @@ static uint8_t *ecell_proj_source_type(const CBMDumpEdge *e, int *out_len) {
return build_index_entry_text_int_text_rowid(e->project, e->source_id, e->type, e->id, out_len);
}
static uint8_t *ecell_src_tgt_type(const CBMDumpEdge *e, int *out_len) {
return build_index_entry_unique_2int_text_rowid(e->source_id, e->target_id, e->type, e->id,
out_len);
return build_index_entry_unique_2int_2text_rowid(e->source_id, e->target_id, e->type,
safe_str(e->local_name), e->id, out_len);
}
static uint8_t *ecell_url_path(const CBMDumpEdge *e, int *out_len) {
const char *url = (e->url_path && e->url_path[0] != '\0') ? e->url_path : NULL;
Expand Down Expand Up @@ -2135,13 +2140,20 @@ static int write_db_after_nodes(write_db_ctx_t *w, uint32_t nodes_root) {
"CREATE INDEX idx_nodes_name ON nodes(project, name)"},
{"index", "idx_nodes_file", "nodes", idx_nodes_file_root,
"CREATE INDEX idx_nodes_file ON nodes(project, file_path)"},
// local_name_gen + widened UNIQUE (#768): must stay semantically
// identical to init_schema in src/store/store.c, and the hand-built
// sqlite_autoindex_edges_1 (cmp_edge_by_src_tgt_type +
// ecell_src_tgt_type) must produce exactly the values SQLite computes
// for local_name_gen, or integrity_check fails on the dumped DB.
{"table", "edges", "edges", edges_root,
"CREATE TABLE edges (\n\t\tid INTEGER PRIMARY KEY AUTOINCREMENT,\n\t\tproject TEXT NOT "
"NULL REFERENCES projects(name) ON DELETE CASCADE,\n\t\tsource_id INTEGER NOT NULL "
"REFERENCES nodes(id) ON DELETE CASCADE,\n\t\ttarget_id INTEGER NOT NULL REFERENCES "
"nodes(id) ON DELETE CASCADE,\n\t\ttype TEXT NOT NULL,\n\t\tproperties TEXT DEFAULT "
"'{}',\n\t\turl_path_gen TEXT GENERATED ALWAYS AS "
"(json_extract(properties,'$.url_path')),\n\t\tUNIQUE(source_id, target_id, type)\n\t)"},
"(json_extract(properties,'$.url_path')),\n\t\tlocal_name_gen TEXT GENERATED ALWAYS AS "
"(CASE WHEN type='IMPORTS' THEN coalesce(json_extract(properties,'$.local_name'),'') "
"ELSE '' END),\n\t\tUNIQUE(source_id, target_id, type, local_name_gen)\n\t)"},
{"index", "sqlite_autoindex_edges_1", "edges", autoindex_edges_root, NULL},
{"index", "idx_edges_source", "edges", idx_edges_source_root,
"CREATE INDEX idx_edges_source ON edges(source_id, type)"},
Expand Down
5 changes: 5 additions & 0 deletions internal/cbm/sqlite_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ typedef struct {
const char *type;
const char *properties; // JSON string
const char *url_path; // extracted from properties by Go (for idx_edges_url_path)
const char *local_name; // for IMPORTS edges: the UNESCAPED
// json_extract(properties,'$.local_name') value; ""/NULL
// otherwise. Feeds sqlite_autoindex_edges_1 — must match
// what SQLite computes for the local_name_gen column or
// integrity_check reports the row missing from the index.
} CBMDumpEdge;

typedef struct {
Expand Down
114 changes: 95 additions & 19 deletions src/graph_buffer/graph_buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,11 @@ static inline void *intptr_to_ptr(intptr_t v) {

/* ── Internal types ──────────────────────────────────────────────── */

/* Edge key for dedup hash table — composite key as string "srcID:tgtID:type" */
#define EDGE_KEY_BUF CBM_SZ_128
/* Edge key for dedup hash table — composite key as string "srcID:tgtID:type",
* plus ":local_name" for IMPORTS edges (#768). 256 bytes fit two int64s, the
* type and a ~200-char local_name verbatim; longer local_names are re-keyed
* with a hash of the full name in make_edge_key (never silently truncated). */
#define EDGE_KEY_BUF CBM_SZ_256

/* Per-type or per-key edge list stored in hash tables as values */
typedef CBM_DYN_ARRAY(const cbm_gbuf_edge_t *) edge_ptr_array_t;
Expand Down Expand Up @@ -135,7 +138,52 @@ static void make_id_key(char *buf, size_t bufsz, int64_t id) {
snprintf(buf, bufsz, "%lld", (long long)id);
}

static void make_edge_key(char *buf, size_t bufsz, int64_t src, int64_t tgt, const char *type) {
/* 64-bit FNV-1a digest of the first `len` bytes starting at `s`.
 * Used to re-key local_names that are too long to embed verbatim in an
 * edge key. Bytes are consumed as unsigned values; `len == 0` yields the
 * FNV offset basis. */
static uint64_t fnv1a64(const char *s, size_t len) {
    const uint64_t offset_basis = 14695981039346656037ULL;
    const uint64_t prime = 1099511628211ULL;

    uint64_t hash = offset_basis;
    const unsigned char *byte = (const unsigned char *)s;
    for (size_t left = len; left > 0; left--) {
        /* FNV-1a order: xor the byte in first, then multiply by the prime. */
        hash = (hash ^ *byte++) * prime;
    }
    return hash;
}

/* Build the dedup hash-table key for an edge into buf (always NUL-terminated
 * by snprintf when bufsz > 0).
 *
 *   buf, bufsz       destination buffer and its size
 *   src, tgt         endpoint node ids
 *   type             edge type string (must be non-NULL)
 *   properties_json  raw properties JSON, or NULL
 *
 * IMPORTS edges carry exactly one imported symbol's local_name (#768): two
 * named imports from the same specifier resolve to the same (source,
 * target) pair but are distinct symbols. Key on local_name too so the
 * second import doesn't dedup-collide with and overwrite the first —
 * every pass that walks IMPORTS edges (pass_calls.c, pass_usages.c,
 * pass_semantic.c, pass_lsp_cross.c) expects one local_name per edge, so
 * losing an edge here silently breaks cross-file call resolution for
 * whichever symbol got dropped, not just "who imports X" queries. Other
 * edge types keep the plain (source,target,type) key: collapsing repeat
 * edges of the same type between the same two nodes (e.g. multiple call
 * sites) into one is the existing, intended dedup behavior there.
 *
 * The local_name is taken as the RAW (still-escaped) JSON string slice
 * following "local_name":" up to its closing quote. Backslash escapes are
 * stepped over while searching for that quote, so an embedded \" does not
 * end the slice early and collide two names sharing a prefix.
 *
 * A local_name too long for the key buffer is re-keyed with an FNV-1a hash
 * of the FULL name instead of being truncated — a truncated key would
 * collide two long names sharing a prefix and silently drop an edge again.
 * The hash key is prefixed with byte 0x01, which cannot appear in the raw
 * JSON slice (control characters must be \u-escaped in JSON), so hash keys
 * can never collide with verbatim keys. */
static void make_edge_key(char *buf, size_t bufsz, int64_t src, int64_t tgt, const char *type,
                          const char *properties_json) {
    if (properties_json && strcmp(type, "IMPORTS") == 0) {
        static const char local_name_key[] = "\"local_name\":\"";
        const char *ln = strstr(properties_json, local_name_key);
        if (ln) {
            ln += sizeof(local_name_key) - 1;

            /* Locate the closing quote of the JSON string. A backslash
             * escapes the following byte, so skip both; an unterminated
             * string simply runs to the end of the buffer. */
            const char *end = ln;
            while (*end != '\0' && *end != '"') {
                if (*end == '\\' && end[1] != '\0') {
                    end++;
                }
                end++;
            }
            size_t ln_len = (size_t)(end - ln);

            /* Only attempt the verbatim key when the name alone could fit;
             * this also keeps the int precision argument in range. */
            int n = -1;
            if (ln_len < bufsz) {
                n = snprintf(buf, bufsz, "%lld:%lld:%s:%.*s", (long long)src, (long long)tgt,
                             type, (int)ln_len, ln);
            }
            if (n < 0 || (size_t)n >= bufsz) {
                /* Did not fit: key on a hash of the full name instead. */
                snprintf(buf, bufsz, "%lld:%lld:%s:\x01%016llx", (long long)src, (long long)tgt,
                         type, (unsigned long long)fnv1a64(ln, ln_len));
            }
            return;
        }
    }
    snprintf(buf, bufsz, "%lld:%lld:%s", (long long)src, (long long)tgt, type);
}

Expand Down Expand Up @@ -244,7 +292,7 @@ static void remove_node_from_ptr_array(node_ptr_array_t *arr, int64_t node_id) {
static void unindex_edge(cbm_gbuf_t *gb, const cbm_gbuf_edge_t *e) {
char key[EDGE_KEY_BUF];

make_edge_key(key, sizeof(key), e->source_id, e->target_id, e->type);
make_edge_key(key, sizeof(key), e->source_id, e->target_id, e->type, e->properties_json);
const char *ekey = cbm_ht_get_key(gb->edge_by_key, key);
cbm_ht_delete(gb->edge_by_key, key);
free((void *)ekey);
Expand Down Expand Up @@ -919,7 +967,7 @@ int64_t cbm_gbuf_insert_edge(cbm_gbuf_t *gb, int64_t source_id, int64_t target_i

/* Check for dedup */
char key[EDGE_KEY_BUF];
make_edge_key(key, sizeof(key), source_id, target_id, type);
make_edge_key(key, sizeof(key), source_id, target_id, type, properties_json);

cbm_gbuf_edge_t *existing = cbm_ht_get(gb->edge_by_key, key);
if (existing) {
Expand Down Expand Up @@ -1032,7 +1080,8 @@ int cbm_gbuf_delete_edges_by_type(cbm_gbuf_t *gb, const char *type) {
cbm_gbuf_edge_t *e = gb->edges.items[i];
if (strcmp(e->type, type) == 0) {
char key[EDGE_KEY_BUF];
make_edge_key(key, sizeof(key), e->source_id, e->target_id, e->type);
make_edge_key(key, sizeof(key), e->source_id, e->target_id, e->type,
e->properties_json);
const char *ekey = cbm_ht_get_key(gb->edge_by_key, key);
cbm_ht_delete(gb->edge_by_key, key);
free((void *)ekey);
Expand Down Expand Up @@ -1182,23 +1231,24 @@ int cbm_gbuf_merge(cbm_gbuf_t *dst, cbm_gbuf_t *src) {

/* ── Dump / Flush ────────────────────────────────────────────────── */

/* Extract url_path value from a properties JSON string.
/* Extract a string property from a properties JSON string.
* Returns heap-allocated string or NULL. Caller must free.
* Parses real JSON: the dump writer feeds this value into idx_edges_url_path,
* whose backing column is GENERATED AS json_extract(properties,'$.url_path').
* Parses real JSON: the dump writer feeds these values into indexes whose
* backing columns are GENERATED AS json_extract(properties,'$.<key>').
* Naive byte slicing returned the ESCAPED text (and cut at embedded \\")
* while json_extract yields the unescaped value — the mismatch left rows
* "missing from index idx_edges_url_path" under PRAGMA integrity_check. */
static char *extract_url_path(const char *props) {
if (!props || !strstr(props, "\"url_path\"")) {
* "missing from index idx_edges_url_path" under PRAGMA integrity_check.
* key_quoted ("\"key\"") is a fast pre-filter to skip the JSON parse. */
static char *extract_prop_string(const char *props, const char *key_quoted, const char *key) {
if (!props || !strstr(props, key_quoted)) {
return NULL;
}
yyjson_doc *doc = yyjson_read(props, strlen(props), 0);
if (!doc) {
return NULL;
}
char *out = NULL;
yyjson_val *v = yyjson_obj_get(yyjson_doc_get_root(doc), "url_path");
yyjson_val *v = yyjson_obj_get(yyjson_doc_get_root(doc), key);
if (v && yyjson_is_str(v)) {
const char *sv = yyjson_get_str(v);
out = cbm_strndup(sv, strlen(sv));
Expand All @@ -1207,6 +1257,16 @@ static char *extract_url_path(const char *props) {
return out;
}

/* Pull the "url_path" string out of an edge's properties JSON by delegating
 * to extract_prop_string. Returns whatever that helper returns (a heap
 * string the caller frees, or NULL). */
static char *extract_url_path(const char *props) {
    static const char key[] = "url_path";
    static const char key_quoted[] = "\"url_path\"";
    return extract_prop_string(props, key_quoted, key);
}

/* Pull the "local_name" string out of an edge's properties JSON by
 * delegating to extract_prop_string. The value feeds the hand-built
 * sqlite_autoindex_edges_1, whose local_name_gen column is non-empty only
 * for IMPORTS edges (#768). This helper does not look at the edge type;
 * restricting it to IMPORTS edges is the caller's job. */
static char *extract_local_name(const char *props) {
    static const char key[] = "local_name";
    static const char key_quoted[] = "\"local_name\"";
    return extract_prop_string(props, key_quoted, key);
}

/* Remap a temp edge ID to its final sequential ID, or 0 if out of range. */
static int64_t remap_id(const int64_t *temp_to_final, int64_t max_temp_id, int64_t temp_id) {
return (temp_id < max_temp_id) ? temp_to_final[temp_id] : 0;
Expand Down Expand Up @@ -1265,9 +1325,11 @@ static CBMDumpNode *build_dump_nodes(cbm_gbuf_t *gb, int live_count, int64_t *te
return dump_nodes;
}

/* Build dump-ready edge array with remapped IDs. Returns url_paths via out param. */
/* Build dump-ready edge array with remapped IDs. Returns url_paths and
* local_names (heap string arrays owned by the caller) via out params. */
static CBMDumpEdge *build_dump_edges(cbm_gbuf_t *gb, const int64_t *temp_to_final,
int64_t max_temp_id, int *out_count, char ***out_url_paths) {
int64_t max_temp_id, int *out_count, char ***out_url_paths,
char ***out_local_names) {
/* Count valid edges (both endpoints resolved) */
int valid_edges = 0;
for (int i = 0; i < gb->edges.count; i++) {
Expand All @@ -1281,6 +1343,7 @@ static CBMDumpEdge *build_dump_edges(cbm_gbuf_t *gb, const int64_t *temp_to_fina
CBMDumpEdge *dump_edges =
malloc((size_t)(valid_edges > 0 ? valid_edges : SKIP_ONE) * sizeof(CBMDumpEdge));
char **url_paths = calloc((size_t)(valid_edges > 0 ? valid_edges : SKIP_ONE), sizeof(char *));
char **local_names = calloc((size_t)(valid_edges > 0 ? valid_edges : SKIP_ONE), sizeof(char *));
int idx = 0;

for (int i = 0; i < gb->edges.count; i++) {
Expand All @@ -1294,6 +1357,12 @@ static CBMDumpEdge *build_dump_edges(cbm_gbuf_t *gb, const int64_t *temp_to_fina
char *url_path = extract_url_path(e->properties_json);
url_paths[idx] = url_path;

/* IMPORTS only — mirrors the local_name_gen CASE in the edges DDL. */
char *local_name = (e->type && strcmp(e->type, "IMPORTS") == 0)
? extract_local_name(e->properties_json)
: NULL;
local_names[idx] = local_name;

const char *props = e->properties_json ? e->properties_json : "{}";
dump_edges[idx] = (CBMDumpEdge){
.id = idx + SKIP_ONE,
Expand All @@ -1303,12 +1372,14 @@ static CBMDumpEdge *build_dump_edges(cbm_gbuf_t *gb, const int64_t *temp_to_fina
.type = e->type,
.properties = props,
.url_path = url_path ? url_path : "",
.local_name = local_name ? local_name : "",
};
idx++;
}

*out_count = idx;
*out_url_paths = url_paths;
*out_local_names = local_names;
return dump_edges;
}

Expand Down Expand Up @@ -1360,12 +1431,15 @@ static void log_dump_summary(int node_count, int edge_count) {
cbm_log_info("gbuf.dump", "nodes", b1, "edges", b2);
}

static void free_dump_resources(char **url_paths, int edge_count, CBMDumpEdge *dump_edges,
CBMDumpNode *dump_nodes, int64_t *temp_to_final) {
static void free_dump_resources(char **url_paths, char **local_names, int edge_count,
CBMDumpEdge *dump_edges, CBMDumpNode *dump_nodes,
int64_t *temp_to_final) {
for (int i = 0; i < edge_count; i++) {
free(url_paths[i]);
free(local_names[i]);
}
free(url_paths);
free(local_names);
free(dump_edges);
free(dump_nodes);
free(temp_to_final);
Expand Down Expand Up @@ -1469,10 +1543,12 @@ int cbm_gbuf_dump_to_sqlite(cbm_gbuf_t *gb, const char *path) {

int edge_idx = 0;
char **url_paths = NULL;
char **local_names = NULL;
CBMDumpEdge *dump_edges = NULL;
if (rc == 0) {
CBM_PROF_START(t_build_edges);
dump_edges = build_dump_edges(gb, temp_to_final, max_temp_id, &edge_idx, &url_paths);
dump_edges =
build_dump_edges(gb, temp_to_final, max_temp_id, &edge_idx, &url_paths, &local_names);
CBM_PROF_END_N("dump", "3_build_dump_edges", t_build_edges, edge_idx);
release_and_remap_vectors(gb, temp_to_final, max_temp_id);
}
Expand All @@ -1489,7 +1565,7 @@ int cbm_gbuf_dump_to_sqlite(cbm_gbuf_t *gb, const char *path) {
}

log_dump_summary(node_idx, edge_idx);
free_dump_resources(url_paths, edge_idx, dump_edges, dump_nodes, temp_to_final);
free_dump_resources(url_paths, local_names, edge_idx, dump_edges, dump_nodes, temp_to_final);
free(src_nodes);
return rc;
}
Expand Down
7 changes: 5 additions & 2 deletions src/pipeline/artifact.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@
#include <stdbool.h>

/* Schema version — increment when DB schema changes (new tables/indexes).
* Import refuses artifacts with schema_version > current. */
#define CBM_ARTIFACT_SCHEMA_VERSION 1
* Import refuses artifacts with schema_version > current.
* v2: edges uniqueness widened to (source_id, target_id, type,
* local_name_gen) so sibling named imports coexist (#768) — old
* binaries cannot upsert against the widened constraint. */
#define CBM_ARTIFACT_SCHEMA_VERSION 2

#define CBM_ARTIFACT_FILENAME "graph.db.zst"
#define CBM_ARTIFACT_META "artifact.json"
Expand Down
Loading
Loading