From b4070c5d44b09b4d66aec7ef2ccfe1cd2e382bba Mon Sep 17 00:00:00 2001 From: Martin Vogel Date: Fri, 3 Jul 2026 21:04:20 +0200 Subject: [PATCH] perf(mcp): prefilter scoped search_code filelist by path_filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply path_filter while writing the scoped filelist instead of only after grep, so large indexed projects do not scan files whose hits the post-grep filter would discard anyway. The prefilter runs the SAME compiled regex via the same cbm_regexec call against the same root-relative path (Windows separators normalized first) as the post-grep filter in collect_grep_matches, which is kept as belt-and-suspenders — results-preserving by construction. Guard the edge the prefilter introduces: a path_filter matching zero indexed files now short-circuits with the normal empty result instead of invoking xargs on an empty filelist (GNU execs grep once with no operands, BSD skips). Tests: positive guard (filter matching the hit's file still returns it, out-of-filter file excluded) and zero-match guard (clean empty response, grep subprocess skipped). Both green pre-change too — perf-only change. Distilled from #756 rebased onto the post-#751 filelist writer. Co-authored-by: yutianyu111602-glitch Signed-off-by: Martin Vogel --- src/mcp/mcp.c | 79 ++++++++++++++++------- tests/test_mcp.c | 162 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 219 insertions(+), 22 deletions(-) diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index d7ee3b63..e185b82e 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -4435,9 +4435,19 @@ static void classify_all_grep_hits(grep_match_t *gm, int gm_count, cbm_store_t * } } -/* Write indexed file list for scoped grep. Returns true if scoped. */ +/* Write indexed file list for scoped grep. Returns true if scoped. + * When a path_filter is provided, apply it here — before grep — so large + * indexed projects do not scan files only for collect_grep_matches to discard + * them later. The predicate is IDENTICAL to the post-grep filter: the same + * compiled regex run against the same root-relative path (separators + * normalized on Windows first), so prefiltering can only skip files whose + * hits would be dropped anyway — results-preserving by construction. + * *out_written receives the number of records written (0 = the filter + * excluded every indexed file). */ static bool write_scoped_filelist(cbm_mcp_server_t *srv, const char *project, const char *root_path, - const char *filelist) { + const char *filelist, bool has_path_filter, + cbm_regex_t *path_regex, int *out_written) { + *out_written = 0; cbm_store_t *pre_store = resolve_store(srv, project); if (!pre_store) { return false; @@ -4450,8 +4460,17 @@ static bool write_scoped_filelist(cbm_mcp_server_t *srv, const char *project, co } FILE *fl = fopen(filelist, "wb"); bool ok = false; + int written = 0; if (fl) { for (int fi = 0; fi < indexed_count; fi++) { + if (has_path_filter && path_regex) { +#ifdef _WIN32 + cbm_normalize_path_sep(indexed_files[fi]); +#endif + if (cbm_regexec(path_regex, indexed_files[fi], 0, NULL, 0) != CBM_REG_OK) { + continue; + } + } /* Write "/" piece-by-piece (no fixed-size buffer, so an * arbitrarily long absolute path cannot overflow). Forward slash join * so xargs doesn't treat Windows backslashes as escapes; binary mode @@ -4468,6 +4487,7 @@ static bool write_scoped_filelist(cbm_mcp_server_t *srv, const char *project, co #else (void)fputc('\0', fl); #endif + written++; } (void)fclose(fl); ok = true; @@ -4476,6 +4496,7 @@ static bool write_scoped_filelist(cbm_mcp_server_t *srv, const char *project, co free(indexed_files[fi]); } free(indexed_files); + *out_written = written; return ok; } @@ -4699,33 +4720,47 @@ static char *handle_search_code(cbm_mcp_server_t *srv, const char *args) { char filelist[CBM_SZ_256]; snprintf(filelist, sizeof(filelist), "%s.files", tmpfile); bool scoped = false; + int scoped_written = 0; - scoped = write_scoped_filelist(srv, project, root_path, filelist); + scoped = write_scoped_filelist(srv, project, root_path, filelist, has_path_filter, + has_path_filter ? &path_regex : NULL, &scoped_written); - char cmd[CBM_SZ_4K]; - build_grep_cmd(cmd, sizeof(cmd), use_regex, scoped, file_pattern, tmpfile, filelist, root_path); + /* Collect grep matches into array */ + int gm_count = 0; + grep_match_t *gm = NULL; + if (scoped && scoped_written == 0) { + /* The path_filter excluded every indexed file — nothing to scan. + * Skip the grep subprocess: xargs on an empty filelist is + * platform-dependent (GNU execs grep once with no operands, BSD + * skips), and the post-grep filter would drop every hit anyway. */ + gm = malloc(sizeof(grep_match_t)); /* empty set; freed below */ + cbm_unlink(tmpfile); + cbm_unlink(filelist); + } else { + char cmd[CBM_SZ_4K]; + build_grep_cmd(cmd, sizeof(cmd), use_regex, scoped, file_pattern, tmpfile, filelist, + root_path); + + FILE *fp = cbm_popen(cmd, "r"); + if (!fp) { + cbm_unlink(tmpfile); + if (scoped) { + cbm_unlink(filelist); + } + free(root_path); + free(pattern); + free(project); + free(file_pattern); + return cbm_mcp_text_result("search failed", true); + } - FILE *fp = cbm_popen(cmd, "r"); - if (!fp) { + gm = collect_grep_matches(fp, root_path, strlen(root_path), has_path_filter, &path_regex, + grep_limit, &gm_count); + cbm_pclose(fp); cbm_unlink(tmpfile); if (scoped) { cbm_unlink(filelist); } - free(root_path); - free(pattern); - free(project); - free(file_pattern); - return cbm_mcp_text_result("search failed", true); - } - - /* Collect grep matches into array */ - int gm_count = 0; - grep_match_t *gm = collect_grep_matches(fp, root_path, strlen(root_path), has_path_filter, - &path_regex, grep_limit, &gm_count); - cbm_pclose(fp); - cbm_unlink(tmpfile); - if (scoped) { - cbm_unlink(filelist); } /* ── Phase 2+3: Block expansion + graph ranking ──────────── */ diff --git a/tests/test_mcp.c b/tests/test_mcp.c index 8d263c57..8d7f9c08 100644 --- a/tests/test_mcp.c +++ b/tests/test_mcp.c @@ -1493,6 +1493,166 @@ TEST(search_code_scoped_path_with_spaces_issue687) { PASS(); } +/* Shared fixture for the path_filter prefilter tests (PR #756 distilled): + * a project with two indexed files that both contain the search pattern — + * src/handler.go (inside the filter) and vendor/other.go (outside it). */ +static cbm_mcp_server_t *setup_prefilter_server(char *tmp, size_t tmp_sz, char *src_path, + size_t src_sz, char *vendor_path, size_t vendor_sz) { + snprintf(tmp, tmp_sz, "/tmp/cbm_srch_pref_XXXXXX"); + if (!cbm_mkdtemp(tmp)) { + return NULL; + } + char dir[640]; + snprintf(dir, sizeof(dir), "%s/src", tmp); + cbm_mkdir(dir); + snprintf(dir, sizeof(dir), "%s/vendor", tmp); + cbm_mkdir(dir); + + snprintf(src_path, src_sz, "%s/src/handler.go", tmp); + snprintf(vendor_path, vendor_sz, "%s/vendor/other.go", tmp); + FILE *fp = fopen(src_path, "w"); + if (!fp) { + return NULL; + } + fprintf(fp, "package main\n\nfunc HandleRequest() error {\n\treturn nil\n}\n"); + fclose(fp); + fp = fopen(vendor_path, "w"); + if (!fp) { + return NULL; + } + fprintf(fp, "package vendored\n\nfunc HandleRequest() error {\n\treturn nil\n}\n"); + fclose(fp); + + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + if (!srv) { + return NULL; + } + cbm_store_t *st = cbm_mcp_server_store(srv); + const char *proj = "prefilter-search"; + cbm_mcp_server_set_project(srv, proj); + cbm_store_upsert_project(st, proj, tmp); + + cbm_node_t n1 = {.project = proj, + .label = "Function", + .name = "HandleRequest", + .qualified_name = "prefilter-search.main.HandleRequest", + .file_path = "src/handler.go", + .start_line = 3, + .end_line = 5}; + cbm_node_t n2 = {.project = proj, + .label = "Function", + .name = "HandleRequest", + .qualified_name = "prefilter-search.vendored.HandleRequest", + .file_path = "vendor/other.go", + .start_line = 3, + .end_line = 5}; + if (cbm_store_upsert_node(st, &n1) <= 0 || cbm_store_upsert_node(st, &n2) <= 0) { + cbm_mcp_server_free(srv); + return NULL; + } + return srv; +} + +static void cleanup_prefilter_dir(const char *tmp, const char *src_path, const char *vendor_path) { + char dir[640]; + unlink(src_path); + unlink(vendor_path); + snprintf(dir, sizeof(dir), "%s/src", tmp); + rmdir(dir); + snprintf(dir, sizeof(dir), "%s/vendor", tmp); + rmdir(dir); + rmdir(tmp); +} + +/* PR #756 (distilled): scoped search_code prefilters the indexed filelist by + * path_filter before grep runs. POSITIVE invariant guard: a path_filter that + * matches the file containing the hit must still return that hit (guards + * against over-filtering — the prefilter predicate must stay IDENTICAL to the + * post-grep filter in collect_grep_matches), and files outside the filter + * stay excluded. Green on pre-prefilter main too (the post-grep filter alone + * produced the same results): the change is results-preserving perf-only. */ +TEST(search_code_path_filter_prefilter_keeps_matches) { + char tmp[512], src_path[768], vendor_path[768]; + cbm_mcp_server_t *srv = setup_prefilter_server(tmp, sizeof(tmp), src_path, sizeof(src_path), + vendor_path, sizeof(vendor_path)); + ASSERT_NOT_NULL(srv); + + char *resp = cbm_mcp_server_handle( + srv, "{\"jsonrpc\":\"2.0\",\"id\":95,\"method\":\"tools/call\"," + "\"params\":{\"name\":\"search_code\"," + "\"arguments\":{\"pattern\":\"HandleRequest\",\"project\":\"prefilter-search\"," + "\"path_filter\":\"^src/\"}}}"); + ASSERT_NOT_NULL(resp); + ASSERT_TRUE(strstr(resp, "\"isError\":true") == NULL); + char *inner = extract_text_content(resp); + ASSERT_NOT_NULL(inner); + + /* The in-filter hit is returned; the out-of-filter file is not. */ + ASSERT_NOT_NULL(strstr(inner, "src/handler.go")); + ASSERT_TRUE(strstr(inner, "vendor/other.go") == NULL); + + /* Exactly the one in-filter grep match survives (same count before and + * after the prefilter — predicate identity). */ + int grep_matches = -1; + const char *g = strstr(inner, "\"total_grep_matches\":"); + if (g) { + sscanf(g, "\"total_grep_matches\":%d", &grep_matches); + } + ASSERT_EQ(grep_matches, 1); + + free(inner); + free(resp); + cbm_mcp_server_free(srv); + cleanup_prefilter_dir(tmp, src_path, vendor_path); + PASS(); +} + +/* PR #756 (distilled): path_filter matching ZERO indexed files. With the + * prefilter the scoped filelist has 0 records, and handle_search_code now + * skips the grep subprocess entirely (xargs on an empty filelist is + * platform-dependent: GNU execs grep once with no operands, BSD skips) and + * returns the empty result directly. Must be a clean zero-result response — + * no error. Green on pre-prefilter main too (there the full filelist is + * grepped and the post-grep filter drops every hit — an empty filelist is + * unreachable on main): guards the edge the prefilter introduces. */ +TEST(search_code_path_filter_matches_nothing) { + char tmp[512], src_path[768], vendor_path[768]; + cbm_mcp_server_t *srv = setup_prefilter_server(tmp, sizeof(tmp), src_path, sizeof(src_path), + vendor_path, sizeof(vendor_path)); + ASSERT_NOT_NULL(srv); + + char *resp = cbm_mcp_server_handle( + srv, "{\"jsonrpc\":\"2.0\",\"id\":96,\"method\":\"tools/call\"," + "\"params\":{\"name\":\"search_code\"," + "\"arguments\":{\"pattern\":\"HandleRequest\",\"project\":\"prefilter-search\"," + "\"path_filter\":\"^no_such_dir/\"}}}"); + ASSERT_NOT_NULL(resp); + ASSERT_TRUE(strstr(resp, "\"isError\":true") == NULL); + char *inner = extract_text_content(resp); + ASSERT_NOT_NULL(inner); + + int grep_matches = -1; + const char *g = strstr(inner, "\"total_grep_matches\":"); + if (g) { + sscanf(g, "\"total_grep_matches\":%d", &grep_matches); + } + ASSERT_EQ(grep_matches, 0); + int results = -1; + const char *r = strstr(inner, "\"total_results\":"); + if (r) { + sscanf(r, "\"total_results\":%d", &results); + } + ASSERT_EQ(results, 0); + ASSERT_TRUE(strstr(inner, "handler.go") == NULL); + ASSERT_TRUE(strstr(inner, "other.go") == NULL); + + free(inner); + free(resp); + cbm_mcp_server_free(srv); + cleanup_prefilter_dir(tmp, src_path, vendor_path); + PASS(); +} + /* issue #283: search_code with regex=true and a syntactically invalid pattern * must return an explicit error, not an empty result indistinguishable from a * legitimate no-match. */ @@ -3810,6 +3970,8 @@ SUITE(mcp) { RUN_TEST(tool_search_code_no_project); RUN_TEST(search_code_multi_word); RUN_TEST(search_code_scoped_path_with_spaces_issue687); + RUN_TEST(search_code_path_filter_prefilter_keeps_matches); + RUN_TEST(search_code_path_filter_matches_nothing); RUN_TEST(search_code_invalid_regex_errors_issue283); RUN_TEST(search_code_literal_pipe_warns_issue282); RUN_TEST(search_code_ampersand_accepted_issue272);