From 9973ff5ffc5856ac8f6861f1727c3a11e31163de Mon Sep 17 00:00:00 2001 From: sciapanCA Date: Sat, 13 Jun 2026 21:27:20 +0200 Subject: [PATCH 1/2] Add DataSource parameter for artifacts endpoints --- src/tests/test_artifact_relationships.py | 92 ++++++++++++++++++++++++ src/tests/test_fetch_artifacts.py | 70 ++++++++++++++++++ src/tests/test_response_transformer.py | 24 +++++++ src/tools/artifact_relationships.py | 38 ++++++++-- src/tools/fetch_artifacts.py | 36 +++++++++- src/utils/errors.py | 11 +++ src/utils/response_transformer.py | 14 ++++ 7 files changed, 277 insertions(+), 8 deletions(-) diff --git a/src/tests/test_artifact_relationships.py b/src/tests/test_artifact_relationships.py index ad4cbdc..a907294 100644 --- a/src/tests/test_artifact_relationships.py +++ b/src/tests/test_artifact_relationships.py @@ -372,6 +372,80 @@ async def test_explicit_profile_maps_correctly(self, mock_get_api_key): call_args = mock_client.post.call_args assert call_args[1]["json"]["profile"] == "InheritanceOnly" + # No data_source supplied => omitted from the body. + assert "dataSource" not in call_args[1]["json"] + + @pytest.mark.asyncio + @patch("tools.artifact_relationships.get_api_key_from_context") + async def test_forwards_data_source(self, mock_get_api_key): + mock_get_api_key.return_value = "test_key" + + ctx = MagicMock(spec=Context) + ctx.debug = AsyncMock() + ctx.error = AsyncMock() + + mock_response = MagicMock() + mock_response.json.return_value = { + "sourceIdentifier": "id", + "profile": "CallsOnly", + "found": True, + "relationships": [], + } + mock_response.raise_for_status = MagicMock() + + mock_client = AsyncMock() + mock_client.post.return_value = mock_response + + mock_context = MagicMock() + mock_context.client = mock_client + mock_context.base_url = "https://app.codealive.ai" + ctx.request_context.lifespan_context = mock_context + + await get_artifact_relationships( + ctx=ctx, + identifier="id", + data_source="repo (main)", + ) + + assert mock_client.post.call_args[1]["json"]["dataSource"] == "repo (main)" + + @pytest.mark.asyncio + @patch("tools.artifact_relationships.get_api_key_from_context") + async def test_ambiguous_409_surfaces_candidate_data_sources(self, mock_get_api_key): + import httpx + + mock_get_api_key.return_value = "test_key" + + ctx = MagicMock(spec=Context) + ctx.debug = AsyncMock() + ctx.error = AsyncMock() + + mock_response = MagicMock() + mock_response.status_code = 409 + mock_response.text = ( + '{"detail": "Identifier matches 2 data sources: ' + "Name='repo (main)' Id='ds-main', Name='repo (master)' Id='ds-master'\"}" + ) + mock_response.raise_for_status.side_effect = httpx.HTTPStatusError( + "Conflict", request=MagicMock(), response=mock_response + ) + + mock_client = AsyncMock() + mock_client.post.return_value = mock_response + + mock_context = MagicMock() + mock_context.client = mock_client + mock_context.base_url = "https://app.codealive.ai" + ctx.request_context.lifespan_context = mock_context + + with pytest.raises(ToolError) as exc: + await get_artifact_relationships(ctx=ctx, identifier="org/repo::path::Symbol") + + message = str(exc.value) + assert "409" in message + # The candidate data sources from the backend 409 must be surfaced, plus the data_source retry hint. + assert "repo (main)" in message and "repo (master)" in message + assert "data_source" in message @pytest.mark.asyncio async def test_empty_identifier_raises_tool_error(self): @@ -446,3 +520,21 @@ async def test_not_found_response_renders_correctly(self, mock_get_api_key): assert data["found"] is False assert "relationships" not in data + + def test_not_found_hint_with_data_source_suggests_retry_or_omit(self): + payload = _build_relationships_dict( + {"sourceIdentifier": "org/repo::path::S", "profile": "CallsOnly", "found": False}, + data_source="repo (main)", + ) + hint = payload["hint"] + assert "repo (main)" in hint + assert "data_source" in hint + assert "omit" in hint.lower() + + def test_not_found_hint_without_data_source_is_generic(self): + payload = _build_relationships_dict( + {"sourceIdentifier": "org/repo::path::S", "profile": "CallsOnly", "found": False}, + ) + hint = payload["hint"] + assert "data_source" not in hint + assert "fresh identifier" in hint diff --git a/src/tests/test_fetch_artifacts.py b/src/tests/test_fetch_artifacts.py index 0c10dc0..eb5f38b 100644 --- a/src/tests/test_fetch_artifacts.py +++ b/src/tests/test_fetch_artifacts.py @@ -365,6 +365,39 @@ def test_hint_absent_when_no_artifacts_have_content(self): assert "" not in result +class TestBuildArtifactsXmlDataSourceMissHint: + """When a data_source was supplied but nothing was found, hint to retry or drop it.""" + + def test_hint_when_data_source_scoped_returns_nothing(self): + data = {"artifacts": [ + {"identifier": "repo::a.ts::F", "content": None, "contentByteSize": None}, + ]} + result = _build_artifacts_xml(data, data_source="repo (main)") + assert "" in result + assert "repo (main)" in result + # Guides toward the two recovery moves. + assert "data_source" in result + assert "omit" in result.lower() + + def test_hint_when_empty_artifacts_and_data_source(self): + result = _build_artifacts_xml({"artifacts": []}, data_source="ds-main") + assert "ds-main" in result and "" in result + + def test_no_miss_hint_when_data_source_resolved_content(self): + data = {"artifacts": [ + {"identifier": "repo::a.ts::F", "content": "code", "contentByteSize": 4}, + ]} + result = _build_artifacts_xml(data, data_source="repo (main)") + assert "omit data_source" not in result + + def test_no_miss_hint_without_data_source(self): + data = {"artifacts": [ + {"identifier": "repo::a.ts::F", "content": None, "contentByteSize": None}, + ]} + result = _build_artifacts_xml(data) + assert "" not in result + + @pytest.mark.asyncio @patch('tools.fetch_artifacts.get_api_key_from_context') async def test_fetch_artifacts_returns_xml(mock_get_api_key): @@ -476,6 +509,43 @@ async def test_fetch_artifacts_posts_correct_body(mock_get_api_key): body = call_args.kwargs["json"] assert body["identifiers"] == ["id1", "id2"] assert "names" not in body + # No data_source supplied => the field is omitted (preserves the 409-on-ambiguity fallback). + assert "dataSource" not in body + + +@pytest.mark.asyncio +@patch('tools.fetch_artifacts.get_api_key_from_context') +async def test_fetch_artifacts_forwards_data_source(mock_get_api_key): + """data_source (Name or Id) is forwarded as the DataSource body field when provided.""" + mock_get_api_key.return_value = "test_key" + + ctx = MagicMock(spec=Context) + ctx.info = AsyncMock() + ctx.warning = AsyncMock() + ctx.error = AsyncMock() + + mock_response = MagicMock() + mock_response.json.return_value = {"artifacts": []} + mock_response.raise_for_status = MagicMock() + + mock_client = AsyncMock() + mock_client.post.return_value = mock_response + + mock_codealive_context = MagicMock() + mock_codealive_context.client = mock_client + mock_codealive_context.base_url = "https://app.codealive.ai" + + ctx.request_context.lifespan_context = mock_codealive_context + ctx.request_context.headers = {"authorization": "Bearer test_key"} + + await fetch_artifacts( + ctx=ctx, + identifiers=["id1"], + data_source="repo (main)", + ) + + body = mock_client.post.call_args.kwargs["json"] + assert body["dataSource"] == "repo (main)" @pytest.mark.asyncio diff --git a/src/tests/test_response_transformer.py b/src/tests/test_response_transformer.py index ad0a359..a1c12af 100644 --- a/src/tests/test_response_transformer.py +++ b/src/tests/test_response_transformer.py @@ -294,10 +294,34 @@ def test_data_preservation(self): assert first["identifier"] == "CodeAlive-AI/codealive-mcp::src/tools/search.py::codebase_search" assert first["contentByteSize"] == 8500 assert first["description"] == "Main search function" + # Data-source identity must be surfaced (not stripped) so the agent can feed it back + # as `data_source` to disambiguate a branch-blind identifier. + assert first["dataSource"] == {"id": "685b21230e3822f4efa9d073", "name": "codealive-mcp"} assert second["path"] == "README.md" assert second["kind"] == "Chunk" assert second["description"] == "Search documentation section" + assert second["dataSource"] == {"id": "685b21230e3822f4efa9d073", "name": "codealive-mcp"} + + def test_grep_transform_surfaces_data_source(self): + response = { + "results": [ + { + "kind": "File", + "identifier": "owner/repo::src/auth.py", + "location": {"path": "src/auth.py"}, + "matchCount": 1, + "matches": [ + {"lineNumber": 3, "startColumn": 0, "endColumn": 4, "lineText": "auth"} + ], + "dataSource": {"type": "repository", "id": "ds-main", "name": "repo (main)"}, + } + ] + } + + result = transform_grep_response(response) + + assert result["results"][0]["dataSource"] == {"id": "ds-main", "name": "repo (main)"} def test_grep_transform_preserves_match_previews(self): response = { diff --git a/src/tools/artifact_relationships.py b/src/tools/artifact_relationships.py index 1e15117..2ffd371 100644 --- a/src/tools/artifact_relationships.py +++ b/src/tools/artifact_relationships.py @@ -1,6 +1,6 @@ """Artifact relationships tool implementation.""" -from typing import Any, Dict, List, Literal +from typing import Any, Dict, List, Literal, Optional from urllib.parse import urljoin import httpx @@ -37,6 +37,7 @@ async def get_artifact_relationships( identifier: str, profile: Literal["callsOnly", "inheritanceOnly", "allRelevant", "referencesOnly"] = "callsOnly", max_count_per_type: int = 50, + data_source: Optional[str] = None, ) -> Dict[str, Any]: """ Retrieve relationship groups for a single artifact by profile. @@ -84,6 +85,11 @@ async def get_artifact_relationships( - "allRelevant": calls + inheritance only; references are excluded - "referencesOnly": where-used LSP references for non-call usage max_count_per_type: Maximum related artifacts per relationship type (1–1000, default 50). + data_source: Optional data-source Name or Id used to disambiguate an identifier that + exists in more than one data source. Copy the `dataSource.name` or + `dataSource.id` from a search result. Omit it for normal lookups; if the + source identifier is ambiguous and you omit it, the backend returns a 409 + listing the candidate data sources. Returns: A dict with grouped relationships: @@ -103,6 +109,7 @@ async def get_artifact_relationships( "identifier": identifier, "profile": profile, "max_count_per_type": max_count_per_type, + "data_source": data_source, } if not identifier: @@ -143,6 +150,8 @@ async def get_artifact_relationships( "profile": api_profile, "maxCountPerType": max_count_per_type, } + if data_source: + body["dataSource"] = data_source await ctx.debug(f"Fetching {profile} relationships for artifact") @@ -156,7 +165,7 @@ async def get_artifact_relationships( log_api_response(response, request_id) response.raise_for_status() - return _build_relationships_dict(response.json()) + return _build_relationships_dict(response.json(), data_source=data_source) except (httpx.HTTPStatusError, Exception) as e: logger.bind( @@ -173,15 +182,24 @@ async def get_artifact_relationships( "(2) call semantic_search or grep_search again to get a fresh identifier — the index may have changed, " "(3) check that the artifact is a function/class (relationships are not available for non-symbol artifacts)" ), + 409: ( + "(1) the identifier exists in more than one data source — see the candidate data sources in the Detail above; each one will resolve, " + "(2) retry get_artifact_relationships with data_source set to one candidate's Name or Id; if that data source isn't the one you want, retry with the next candidate, " + "(3) do NOT invent relation results — pick from the listed data sources" + ), }, ) -def _build_relationships_dict(data: dict) -> Dict[str, Any]: +def _build_relationships_dict(data: dict, data_source: Optional[str] = None) -> Dict[str, Any]: """Build a dict representation of an artifact relationships response. FastMCP serializes the dict via pydantic_core.to_json, which preserves UTF-8 — don't reintroduce json.dumps here, it would re-escape non-ASCII identifiers. + + ``data_source`` is the selector the caller passed (if any); when the source is not + found it shapes the recovery hint so the agent can retry with another data source + or drop the selector. """ raw_source_id = data.get("sourceIdentifier") or "" raw_profile = data.get("profile") or "" @@ -208,9 +226,9 @@ def _build_relationships_dict(data: dict) -> Dict[str, Any]: counts = _build_counts(data.get("availableRelationshipCounts")) if counts is not None: payload["availableRelationshipCounts"] = counts - payload["hint"] = _build_relationship_hint(found, mcp_profile, groups, counts) + payload["hint"] = _build_relationship_hint(found, mcp_profile, groups, counts, data_source) else: - payload["hint"] = _build_relationship_hint(found, mcp_profile, [], None) + payload["hint"] = _build_relationship_hint(found, mcp_profile, [], None, data_source) return payload @@ -266,9 +284,19 @@ def _build_relationship_hint( profile: str, groups: List[Dict[str, Any]], counts: Dict[str, int] | None, + data_source: Optional[str] = None, ) -> str: """Give model-facing next-step guidance for graph traversal results.""" if not found: + if data_source: + return ( + f'No relationship data was found for this identifier in data source "{data_source}". ' + "The identifier may belong to a different data source, or the data_source value may be " + "wrong. Try: re-run with data_source set to a different candidate (use the `dataSource` " + "name or id from your search results, or call get_data_sources), or omit data_source " + "entirely — if the identifier is ambiguous you then get a 409 listing the candidate data " + "sources. Otherwise re-run semantic_search or grep_search to get a fresh identifier." + ) return ( "No relationship data was found for this identifier. Verify that the identifier came from " "a recent search/fetch result and points to a symbol-level artifact; otherwise re-run " diff --git a/src/tools/fetch_artifacts.py b/src/tools/fetch_artifacts.py index d61ab93..962df26 100644 --- a/src/tools/fetch_artifacts.py +++ b/src/tools/fetch_artifacts.py @@ -1,6 +1,6 @@ """Fetch artifacts tool implementation.""" -from typing import List, Union +from typing import List, Optional, Union from urllib.parse import urljoin import httpx @@ -17,6 +17,7 @@ async def fetch_artifacts( ctx: Context, identifiers: Union[str, List[str]], + data_source: Optional[str] = None, ) -> str: """ Retrieve the full content of code artifacts by their identifiers. @@ -39,6 +40,12 @@ async def fetch_artifacts( File: "my-org/backend::src/services/auth.py" Chunk: "my-org/backend::README.md::0042" + data_source: Optional data-source Name or Id used to disambiguate an identifier that + exists in more than one data source. Copy the `dataSource.name` or + `dataSource.id` from the search result you want. Omit it for normal lookups; + if an identifier is ambiguous and you omit it, the backend returns a 409 + listing the candidate data sources. + Returns: XML with full content and call relationships for each found artifact: @@ -92,6 +99,8 @@ async def fetch_artifacts( } body = {"identifiers": identifiers} + if data_source: + body["dataSource"] = data_source await ctx.info(f"Fetching {len(identifiers)} artifact(s)") @@ -112,7 +121,7 @@ async def fetch_artifacts( artifacts_data = response.json() # Build XML output - return _build_artifacts_xml(artifacts_data) + return _build_artifacts_xml(artifacts_data, data_source=data_source) except (httpx.HTTPStatusError, Exception) as e: # handle_api_error raises ToolError → MCP response gets isError: true @@ -124,6 +133,11 @@ async def fetch_artifacts( "(2) re-run semantic_search or grep_search to get fresh identifiers — the index may have changed, " "(3) for local repos in your working directory, use Read() on the file path instead" ), + 409: ( + "(1) the identifier exists in more than one data source — see the candidate data sources in the Detail above; each one will resolve, " + "(2) retry fetch_artifacts with data_source set to one candidate's Name or Id; if that data source isn't the one you want, retry with the next candidate, " + "(3) do NOT invent a result — pick from the listed data sources" + ), }, ) @@ -147,7 +161,7 @@ def _add_line_numbers(content: str, start_line: int = 1) -> str: return "\n".join(numbered) -def _build_artifacts_xml(data: dict) -> str: +def _build_artifacts_xml(data: dict, data_source: str | None = None) -> str: """Build XML representation of fetched artifacts. Backend DTO: Identifier (string), Content (string?), ContentByteSize (long?), @@ -157,16 +171,22 @@ def _build_artifacts_xml(data: dict) -> str: Content is emitted raw (no HTML escaping) and wrapped between newlines so the LLM sees the source code exactly as-is. + + When ``data_source`` was supplied and nothing was found, emit a recovery hint: + the identifier may live in a different data source, or the selector may be wrong — + so the agent can retry with another data source or drop the selector entirely. """ xml_parts = [""] has_any_relationships = False + emitted = 0 artifacts = data.get("artifacts", []) for artifact in artifacts: content = artifact.get("content") if content is None: continue + emitted += 1 identifier = artifact.get("identifier", "") content_byte_size = artifact.get("contentByteSize") @@ -200,6 +220,16 @@ def _build_artifacts_xml(data: dict) -> str: 'an artifact identifier.' ) + if emitted == 0 and data_source: + xml_parts.append( + f' No artifacts were found in data source "{data_source}". The identifier may ' + 'belong to a different data source, or the data_source value may be wrong. Try: ' + '(1) re-run fetch_artifacts with data_source set to a different candidate (use the ' + '`dataSource` name or id from your search results, or call get_data_sources), or ' + '(2) omit data_source entirely — if the identifier is ambiguous you then get a 409 ' + 'that lists the candidate data sources to choose from.' + ) + xml_parts.append("") return "\n".join(xml_parts) diff --git a/src/utils/errors.py b/src/utils/errors.py index 730bf29..22b662a 100644 --- a/src/utils/errors.py +++ b/src/utils/errors.py @@ -138,6 +138,17 @@ class _ErrorTemplate: "(3) verify any identifiers were returned by a recent semantic_search, grep_search, or codebase_search" ), ), + 409: _ErrorTemplate( + label="Ambiguous data source (409): the identifier matches more than one data source", + retryable=True, + retry_window="retry once with data_source set", + default_hint=( + "(1) read the candidate data sources in the Detail below — every one will resolve, " + "(2) retry with data_source set to one candidate's Name or Id; if that data source isn't " + "the one you want, retry with the next candidate, " + "(3) do not invent a result" + ), + ), 422: _ErrorTemplate( label="Data source not ready (422): The requested data source is still being indexed", retryable=True, diff --git a/src/utils/response_transformer.py b/src/utils/response_transformer.py index fd33994..2bdcbce 100644 --- a/src/utils/response_transformer.py +++ b/src/utils/response_transformer.py @@ -171,6 +171,20 @@ def _build_result_dict(path: str, result: Dict) -> Dict[str, Any]: if result.get("identifier"): info["identifier"] = result["identifier"] + # Surface the data source identity (id + name) so the agent can disambiguate a branch-blind + # identifier — the same repo indexed on two branches yields identical identifiers across data + # sources. Either value can be passed back as `data_source` to fetch_artifacts / + # get_artifact_relationships. (Previously this field was stripped here.) + data_source = result.get("dataSource") + if isinstance(data_source, dict): + ds_info: Dict[str, Any] = {} + if data_source.get("id"): + ds_info["id"] = data_source["id"] + if data_source.get("name"): + ds_info["name"] = data_source["name"] + if ds_info: + info["dataSource"] = ds_info + if result.get("contentByteSize") is not None: info["contentByteSize"] = result["contentByteSize"] From e68b73c31c376cb4fea5e3055b2a34e887bdeee8 Mon Sep 17 00:00:00 2001 From: sciapanCA Date: Sat, 13 Jun 2026 23:23:50 +0200 Subject: [PATCH 2/2] PR feedback for artifacts endpoints --- src/tests/test_artifact_relationships.py | 49 +++++++++++++++++++++--- src/tests/test_fetch_artifacts.py | 48 ++++++++++++++++++++--- src/tests/test_response_transformer.py | 4 +- src/tools/artifact_relationships.py | 6 +++ src/tools/fetch_artifacts.py | 5 +++ 5 files changed, 99 insertions(+), 13 deletions(-) diff --git a/src/tests/test_artifact_relationships.py b/src/tests/test_artifact_relationships.py index a907294..ef2a977 100644 --- a/src/tests/test_artifact_relationships.py +++ b/src/tests/test_artifact_relationships.py @@ -404,10 +404,47 @@ async def test_forwards_data_source(self, mock_get_api_key): await get_artifact_relationships( ctx=ctx, identifier="id", - data_source="repo (main)", + data_source="backend", ) - assert mock_client.post.call_args[1]["json"]["dataSource"] == "repo (main)" + assert mock_client.post.call_args[1]["json"]["dataSource"] == "backend" + + @pytest.mark.asyncio + @patch("tools.artifact_relationships.get_api_key_from_context") + async def test_whitespace_data_source_omitted(self, mock_get_api_key): + """A whitespace-only data_source normalizes to None: not sent to the backend + and not echoed in the not-found hint (preserves the 409-on-ambiguity fallback).""" + mock_get_api_key.return_value = "test_key" + + ctx = MagicMock(spec=Context) + ctx.debug = AsyncMock() + ctx.error = AsyncMock() + + mock_response = MagicMock() + mock_response.json.return_value = { + "sourceIdentifier": "id", + "profile": "CallsOnly", + "found": False, + } + mock_response.raise_for_status = MagicMock() + + mock_client = AsyncMock() + mock_client.post.return_value = mock_response + + mock_context = MagicMock() + mock_context.client = mock_client + mock_context.base_url = "https://app.codealive.ai" + ctx.request_context.lifespan_context = mock_context + + result = await get_artifact_relationships( + ctx=ctx, + identifier="id", + data_source=" ", + ) + + assert "dataSource" not in mock_client.post.call_args[1]["json"] + # The confusing `... in data source " "` hint must not appear. + assert '" "' not in result["hint"] @pytest.mark.asyncio @patch("tools.artifact_relationships.get_api_key_from_context") @@ -424,7 +461,7 @@ async def test_ambiguous_409_surfaces_candidate_data_sources(self, mock_get_api_ mock_response.status_code = 409 mock_response.text = ( '{"detail": "Identifier matches 2 data sources: ' - "Name='repo (main)' Id='ds-main', Name='repo (master)' Id='ds-master'\"}" + "Name='backend' Id='ds-main', Name='backend-legacy' Id='ds-master'\"}" ) mock_response.raise_for_status.side_effect = httpx.HTTPStatusError( "Conflict", request=MagicMock(), response=mock_response @@ -444,7 +481,7 @@ async def test_ambiguous_409_surfaces_candidate_data_sources(self, mock_get_api_ message = str(exc.value) assert "409" in message # The candidate data sources from the backend 409 must be surfaced, plus the data_source retry hint. - assert "repo (main)" in message and "repo (master)" in message + assert "backend" in message and "backend-legacy" in message assert "data_source" in message @pytest.mark.asyncio @@ -524,10 +561,10 @@ async def test_not_found_response_renders_correctly(self, mock_get_api_key): def test_not_found_hint_with_data_source_suggests_retry_or_omit(self): payload = _build_relationships_dict( {"sourceIdentifier": "org/repo::path::S", "profile": "CallsOnly", "found": False}, - data_source="repo (main)", + data_source="backend", ) hint = payload["hint"] - assert "repo (main)" in hint + assert "backend" in hint assert "data_source" in hint assert "omit" in hint.lower() diff --git a/src/tests/test_fetch_artifacts.py b/src/tests/test_fetch_artifacts.py index eb5f38b..46a7d88 100644 --- a/src/tests/test_fetch_artifacts.py +++ b/src/tests/test_fetch_artifacts.py @@ -372,9 +372,9 @@ def test_hint_when_data_source_scoped_returns_nothing(self): data = {"artifacts": [ {"identifier": "repo::a.ts::F", "content": None, "contentByteSize": None}, ]} - result = _build_artifacts_xml(data, data_source="repo (main)") + result = _build_artifacts_xml(data, data_source="backend") assert "" in result - assert "repo (main)" in result + assert "backend" in result # Guides toward the two recovery moves. assert "data_source" in result assert "omit" in result.lower() @@ -387,7 +387,7 @@ def test_no_miss_hint_when_data_source_resolved_content(self): data = {"artifacts": [ {"identifier": "repo::a.ts::F", "content": "code", "contentByteSize": 4}, ]} - result = _build_artifacts_xml(data, data_source="repo (main)") + result = _build_artifacts_xml(data, data_source="backend") assert "omit data_source" not in result def test_no_miss_hint_without_data_source(self): @@ -541,11 +541,49 @@ async def test_fetch_artifacts_forwards_data_source(mock_get_api_key): await fetch_artifacts( ctx=ctx, identifiers=["id1"], - data_source="repo (main)", + data_source="backend", ) body = mock_client.post.call_args.kwargs["json"] - assert body["dataSource"] == "repo (main)" + assert body["dataSource"] == "backend" + + +@pytest.mark.asyncio +@patch('tools.fetch_artifacts.get_api_key_from_context') +async def test_fetch_artifacts_whitespace_data_source_omitted(mock_get_api_key): + """A whitespace-only data_source normalizes to None: not sent to the backend + and not echoed in the not-found hint (preserves the 409-on-ambiguity fallback).""" + mock_get_api_key.return_value = "test_key" + + ctx = MagicMock(spec=Context) + ctx.info = AsyncMock() + ctx.warning = AsyncMock() + ctx.error = AsyncMock() + + mock_response = MagicMock() + mock_response.json.return_value = {"artifacts": []} + mock_response.raise_for_status = MagicMock() + + mock_client = AsyncMock() + mock_client.post.return_value = mock_response + + mock_codealive_context = MagicMock() + mock_codealive_context.client = mock_client + mock_codealive_context.base_url = "https://app.codealive.ai" + + ctx.request_context.lifespan_context = mock_codealive_context + ctx.request_context.headers = {"authorization": "Bearer test_key"} + + result = await fetch_artifacts( + ctx=ctx, + identifiers=["id1"], + data_source=" ", + ) + + body = mock_client.post.call_args.kwargs["json"] + assert "dataSource" not in body + # The confusing `... data source " "` hint must not appear. + assert '" "' not in result @pytest.mark.asyncio diff --git a/src/tests/test_response_transformer.py b/src/tests/test_response_transformer.py index a1c12af..52058ca 100644 --- a/src/tests/test_response_transformer.py +++ b/src/tests/test_response_transformer.py @@ -314,14 +314,14 @@ def test_grep_transform_surfaces_data_source(self): "matches": [ {"lineNumber": 3, "startColumn": 0, "endColumn": 4, "lineText": "auth"} ], - "dataSource": {"type": "repository", "id": "ds-main", "name": "repo (main)"}, + "dataSource": {"type": "repository", "id": "ds-main", "name": "backend"}, } ] } result = transform_grep_response(response) - assert result["results"][0]["dataSource"] == {"id": "ds-main", "name": "repo (main)"} + assert result["results"][0]["dataSource"] == {"id": "ds-main", "name": "backend"} def test_grep_transform_preserves_match_previews(self): response = { diff --git a/src/tools/artifact_relationships.py b/src/tools/artifact_relationships.py index 2ffd371..54ab63f 100644 --- a/src/tools/artifact_relationships.py +++ b/src/tools/artifact_relationships.py @@ -112,6 +112,12 @@ async def get_artifact_relationships( "data_source": data_source, } + # Normalize the optional selector: treat empty/whitespace-only as "no selector" + # so we don't send a junk dataSource to the backend or echo it in the not-found hint. + # (tool_arguments above intentionally keeps the raw value for exact-invocation logging.) + if data_source is not None: + data_source = data_source.strip() or None + if not identifier: logger.bind(tool=_TOOL_NAME, tool_arguments=tool_arguments).warning( "Tool validation failed: artifact identifier is required" diff --git a/src/tools/fetch_artifacts.py b/src/tools/fetch_artifacts.py index 962df26..d8d4487 100644 --- a/src/tools/fetch_artifacts.py +++ b/src/tools/fetch_artifacts.py @@ -81,6 +81,11 @@ async def fetch_artifacts( # deferred tools, LiveKit agents, etc.) into a proper Python list. identifiers = coerce_stringified_list(identifiers) + # Normalize the optional selector: treat empty/whitespace-only as "no selector" + # so we don't send a junk dataSource to the backend or echo it in the not-found hint. + if data_source is not None: + data_source = data_source.strip() or None + if not identifiers: raise ToolError(f"[{_TOOL_NAME}] At least one identifier is required.")