From 00a295f2bee37cb7467fec24a282e1a5bf1d75d6 Mon Sep 17 00:00:00 2001 From: Pratik Shrivastav Date: Tue, 28 Apr 2026 15:51:47 -0700 Subject: [PATCH 01/11] Phase 3 Stage 1: Add embed flag to stored procedure parameters Add 'embed' boolean property to ParameterMetadata for marking stored procedure parameters that should be automatically embedded via the EmbeddingService before being passed to the sproc. Config & Schema: - ParameterMetadata.cs: added Embed bool property (default false) - dab.draft.schema.json: added 'embed' to parameter array items Validation (RuntimeConfigValidator.cs): - embed:true only valid on stored-procedure entities - embed:true requires runtime.embeddings configured and enabled - embed:true cannot coexist with a default value Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- schemas/dab.draft.schema.json | 3 +- src/Config/ObjectModel/ParameterMetadata.cs | 7 ++ .../Configurations/RuntimeConfigValidator.cs | 78 +++++++++++++++++++ 3 files changed, 87 insertions(+), 1 deletion(-) diff --git a/schemas/dab.draft.schema.json b/schemas/dab.draft.schema.json index 41c62d4f90..4e60103a0c 100644 --- a/schemas/dab.draft.schema.json +++ b/schemas/dab.draft.schema.json @@ -1245,7 +1245,8 @@ "name": { "type": "string", "description": "Parameter name" }, "required": { "$ref": "#/$defs/boolean-or-string", "description": "Is parameter required" }, "default": { "type": ["string", "number", "boolean", "null"], "description": "Default value" }, - "description": { "type": "string", "description": "Parameter description. Since descriptions for multiple parameters are provided as a comma-separated string, individual parameter descriptions must not contain a comma (',')." } + "description": { "type": "string", "description": "Parameter description. Since descriptions for multiple parameters are provided as a comma-separated string, individual parameter descriptions must not contain a comma (',')." }, + "embed": { "type": "boolean", "description": "When true, the parameter text is automatically converted to an embedding vector via the configured embedding service before being passed to the stored procedure. Requires runtime.embeddings to be configured. Only valid on stored-procedure entities.", "default": false } } } } diff --git a/src/Config/ObjectModel/ParameterMetadata.cs b/src/Config/ObjectModel/ParameterMetadata.cs index 334979d728..e087bf7819 100644 --- a/src/Config/ObjectModel/ParameterMetadata.cs +++ b/src/Config/ObjectModel/ParameterMetadata.cs @@ -24,5 +24,12 @@ public class ParameterMetadata /// Gets or sets the default value of the parameter, if any. /// public string? Default { get; set; } + + /// + /// When true, the parameter value (text) is automatically embedded via the + /// EmbeddingService and the resulting vector is passed to the stored procedure. + /// Only valid on stored-procedure entities when runtime.embeddings is configured. + /// + public bool Embed { get; set; } } } diff --git a/src/Core/Configurations/RuntimeConfigValidator.cs b/src/Core/Configurations/RuntimeConfigValidator.cs index 32b2ff93b6..dff4477b86 100644 --- a/src/Core/Configurations/RuntimeConfigValidator.cs +++ b/src/Core/Configurations/RuntimeConfigValidator.cs @@ -98,6 +98,7 @@ public void ValidateConfigProperties() ValidateAzureLogAnalyticsAuth(runtimeConfig); ValidateFileSinkPath(runtimeConfig); ValidateEmbeddingsOptions(runtimeConfig); + ValidateEmbedParameters(runtimeConfig); } /// @@ -421,7 +422,84 @@ public void ValidateEmbeddingsOptions(RuntimeConfig runtimeConfig) subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); } } + } + + /// + /// Validates that parameters with embed=true are only used on stored-procedure entities + /// and that runtime.embeddings is configured when embed parameters are present. + /// + private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) + { + // Check once whether the embedding service is configured and enabled. + // Example: "runtime": { "embeddings": { "enabled": true, "provider": "azure-openai" } } → true + // Example: embeddings section missing or "enabled": false → false + bool embeddingsConfigured = runtimeConfig.Runtime?.Embeddings is not null + && runtimeConfig.Runtime.Embeddings.Enabled; + + // Loop through every entity in the config. + // Example entities: "Product" (table), "Category" (table), "SearchProducts" (sproc) + foreach ((string entityName, Entity entity) in runtimeConfig.Entities) + { + // Skip entities that have no parameters defined. + // Tables and views typically don't have parameters. + // Example: "Product": { "source": { "type": "table" } } → Parameters is null → skip + if (entity.Source.Parameters is null) + { + continue; + } + + // Check each parameter for the embed flag. + // Example: iterates over { "name": "query_vector", "embed": true } and { "name": "top_k", "default": "5" } + foreach (ParameterMetadata param in entity.Source.Parameters) + { + // Skip parameters that don't have embed: true. Most params are normal pass-through. + // Example: "top_k" has Embed=false (default) → skip + // Example: "query_vector" has Embed=true → continue to validation checks + if (!param.Embed) + { + continue; + } + + // Rule 1: embed:true is only valid on stored-procedure entities. + // Tables/views don't have user-supplied parameters that get passed to SQL. + // Example PASS: "SearchProducts": { "source": { "type": "stored-procedure" } } + // Example FAIL: "Product": { "source": { "type": "table", "parameters": [{"name":"x","embed":true}] } } + // → Error: "Entity 'Product': parameter 'x' has 'embed: true' but is only valid on stored-procedure entities." + if (entity.Source.Type is not EntitySourceType.StoredProcedure) + { + HandleOrRecordException(new DataApiBuilderException( + message: $"Entity '{entityName}': parameter '{param.Name}' has 'embed: true' but is only valid on stored-procedure entities.", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); + } + // Rule 2: embed:true requires runtime.embeddings to be configured and enabled. + // Can't convert text to vectors without an embedding service. + // Example PASS: "embeddings": { "enabled": true, "provider": "azure-openai", "api-key": "..." } + // Example FAIL: "embeddings": { "enabled": false } or embeddings section missing + // → Error: "parameter 'query_vector' has 'embed: true' but runtime.embeddings is not configured or not enabled." + if (!embeddingsConfigured) + { + HandleOrRecordException(new DataApiBuilderException( + message: $"Entity '{entityName}': parameter '{param.Name}' has 'embed: true' but runtime.embeddings is not configured or not enabled.", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); + } + + // Rule 3: embed:true with a default value is not supported. + // A default value is text (e.g., "wireless headphones") that would need embedding at startup — not supported. + // Example PASS: { "name": "query_vector", "embed": true } (no default) + // Example FAIL: { "name": "query_vector", "embed": true, "default": "wireless headphones" } + // → Error: "parameter 'query_vector' has both 'embed: true' and a 'default' value. Embed parameters cannot have default values." + if (param.Default is not null) + { + HandleOrRecordException(new DataApiBuilderException( + message: $"Entity '{entityName}': parameter '{param.Name}' has both 'embed: true' and a 'default' value. Embed parameters cannot have default values.", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); + } + } + } } /// From 3ad2d26d70567cfa6da401161f6144ce9c303454 Mon Sep 17 00:00:00 2001 From: Pratik Shrivastav Date: Tue, 28 Apr 2026 17:09:07 -0700 Subject: [PATCH 02/11] Phase 3 Stage 2: Wire EmbeddingService into stored procedure execution Core implementation of automatic text-to-vector substitution for embed:true parameters. When a user sends text for an embed parameter, DAB automatically embeds it via EmbeddingService and passes the vector to the stored procedure. New file: - ParameterEmbeddingHelper.cs: text -> TryEmbedAsync -> float[] -> JSON string substitution. Handles empty/null text (400), embedding failures (500). DI wiring: - SqlQueryEngine + SqlMutationEngine: added IEmbeddingService? (optional) - QueryEngineFactory + MutationEngineFactory: pass service through to engines Execution path: - SqlQueryEngine.ExecuteAsync (REST + GraphQL): call helper before SqlExecuteStructure construction - SqlMutationEngine.ExecuteAsync (REST POST): same Metadata type override (Approach B): - MsSqlMetadataProvider: for embed:true params where SQL reports VECTOR as varbinary/Byte[], override SystemType->String, DbType->String, SqlDbType->NVarChar. Follows existing DateTime override pattern. Gated by: embed:true AND Byte[] type. Normal varbinary params unaffected. Verified: 13 tests (5 positive + 8 negative), including semantic search, cross-language, error cases, and non-embed entity regression check. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Factories/MutationEngineFactory.cs | 17 ++- .../Resolvers/Factories/QueryEngineFactory.cs | 9 +- src/Core/Resolvers/SqlMutationEngine.cs | 18 ++- src/Core/Resolvers/SqlQueryEngine.cs | 30 ++++- .../Embeddings/ParameterEmbeddingHelper.cs | 110 ++++++++++++++++++ .../MsSqlMetadataProvider.cs | 21 ++++ 6 files changed, 191 insertions(+), 14 deletions(-) create mode 100644 src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs diff --git a/src/Core/Resolvers/Factories/MutationEngineFactory.cs b/src/Core/Resolvers/Factories/MutationEngineFactory.cs index 9b4df9f8bf..f335ba87eb 100644 --- a/src/Core/Resolvers/Factories/MutationEngineFactory.cs +++ b/src/Core/Resolvers/Factories/MutationEngineFactory.cs @@ -8,6 +8,7 @@ using Azure.DataApiBuilder.Core.Configurations; using Azure.DataApiBuilder.Core.Models; using Azure.DataApiBuilder.Core.Services.MetadataProviders; +using Azure.DataApiBuilder.Core.Services.Embeddings; using Azure.DataApiBuilder.Service.Exceptions; using Microsoft.AspNetCore.Http; using static Azure.DataApiBuilder.Config.DabConfigEvents; @@ -29,18 +30,11 @@ public class MutationEngineFactory : IMutationEngineFactory private readonly IHttpContextAccessor _httpContextAccessor; private readonly IAuthorizationResolver _authorizationResolver; private readonly GQLFilterParser _gQLFilterParser; + private readonly IEmbeddingService? _embeddingService; /// /// Initializes a new instance of the class. /// - /// runtimeConfigProvider. - /// queryManagerFactory - /// metadataProviderFactory. - /// cosmosClientProvider - /// queryEngineFactory. - /// httpContextAccessor. - /// authorizationResolver. - /// GqlFilterParser. public MutationEngineFactory(RuntimeConfigProvider runtimeConfigProvider, IAbstractQueryManagerFactory queryManagerFactory, IMetadataProviderFactory metadataProviderFactory, @@ -49,7 +43,8 @@ public MutationEngineFactory(RuntimeConfigProvider runtimeConfigProvider, IHttpContextAccessor httpContextAccessor, IAuthorizationResolver authorizationResolver, GQLFilterParser gQLFilterParser, - HotReloadEventHandler? handler) + HotReloadEventHandler? handler, + IEmbeddingService? embeddingService = null) { handler?.Subscribe(MUTATION_ENGINE_FACTORY_ON_CONFIG_CHANGED, OnConfigChanged); @@ -61,6 +56,7 @@ public MutationEngineFactory(RuntimeConfigProvider runtimeConfigProvider, _queryEngineFactory = queryEngineFactory; _runtimeConfigProvider = runtimeConfigProvider; _gQLFilterParser = gQLFilterParser; + _embeddingService = embeddingService; _mutationEngines = new Dictionary(); ConfigureMutationEngines(); } @@ -78,7 +74,8 @@ private void ConfigureMutationEngines() _authorizationResolver, _gQLFilterParser, _httpContextAccessor, - _runtimeConfigProvider); + _runtimeConfigProvider, + _embeddingService); _mutationEngines.Add(DatabaseType.MySQL, mutationEngine); _mutationEngines.Add(DatabaseType.MSSQL, mutationEngine); _mutationEngines.Add(DatabaseType.PostgreSQL, mutationEngine); diff --git a/src/Core/Resolvers/Factories/QueryEngineFactory.cs b/src/Core/Resolvers/Factories/QueryEngineFactory.cs index 1d2ae2935d..9ae91015ad 100644 --- a/src/Core/Resolvers/Factories/QueryEngineFactory.cs +++ b/src/Core/Resolvers/Factories/QueryEngineFactory.cs @@ -9,6 +9,7 @@ using Azure.DataApiBuilder.Core.Models; using Azure.DataApiBuilder.Core.Services.Cache; using Azure.DataApiBuilder.Core.Services.MetadataProviders; +using Azure.DataApiBuilder.Core.Services.Embeddings; using Azure.DataApiBuilder.Service.Exceptions; using Microsoft.AspNetCore.Http; using Microsoft.Extensions.Logging; @@ -33,6 +34,7 @@ public class QueryEngineFactory : IQueryEngineFactory private readonly GQLFilterParser _gQLFilterParser; private readonly DabCacheService _cache; private readonly ILogger _logger; + private readonly IEmbeddingService? _embeddingService; /// public QueryEngineFactory(RuntimeConfigProvider runtimeConfigProvider, @@ -44,7 +46,8 @@ public QueryEngineFactory(RuntimeConfigProvider runtimeConfigProvider, GQLFilterParser gQLFilterParser, ILogger logger, DabCacheService cache, - HotReloadEventHandler? handler) + HotReloadEventHandler? handler, + IEmbeddingService? embeddingService = null) { handler?.Subscribe(QUERY_ENGINE_FACTORY_ON_CONFIG_CHANGED, OnConfigChanged); _queryEngines = new Dictionary(); @@ -57,6 +60,7 @@ public QueryEngineFactory(RuntimeConfigProvider runtimeConfigProvider, _gQLFilterParser = gQLFilterParser; _cache = cache; _logger = logger; + _embeddingService = embeddingService; ConfigureQueryEngines(); } @@ -75,7 +79,8 @@ public void ConfigureQueryEngines() _gQLFilterParser, _logger, _runtimeConfigProvider, - _cache); + _cache, + _embeddingService); _queryEngines.Add(DatabaseType.MSSQL, queryEngine); _queryEngines.Add(DatabaseType.MySQL, queryEngine); _queryEngines.Add(DatabaseType.PostgreSQL, queryEngine); diff --git a/src/Core/Resolvers/SqlMutationEngine.cs b/src/Core/Resolvers/SqlMutationEngine.cs index 204d71d44f..73d78997c0 100644 --- a/src/Core/Resolvers/SqlMutationEngine.cs +++ b/src/Core/Resolvers/SqlMutationEngine.cs @@ -16,6 +16,7 @@ using Azure.DataApiBuilder.Core.Resolvers.Sql_Query_Structures; using Azure.DataApiBuilder.Core.Services; using Azure.DataApiBuilder.Core.Services.MetadataProviders; +using Azure.DataApiBuilder.Core.Services.Embeddings; using Azure.DataApiBuilder.Service.Exceptions; using Azure.DataApiBuilder.Service.GraphQLBuilder; using Azure.DataApiBuilder.Service.GraphQLBuilder.Mutations; @@ -41,6 +42,7 @@ public class SqlMutationEngine : IMutationEngine private readonly IHttpContextAccessor _httpContextAccessor; private readonly GQLFilterParser _gQLFilterParser; private readonly RuntimeConfigProvider _runtimeConfigProvider; + private readonly IEmbeddingService? _embeddingService; public const string IS_UPDATE_RESULT_SET = "IsUpdateResultSet"; private const string TRANSACTION_EXCEPTION_ERROR_MSG = "An unexpected error occurred during the transaction execution"; public const string SINGLE_INPUT_ARGUEMENT_NAME = "item"; @@ -60,7 +62,8 @@ public SqlMutationEngine( IAuthorizationResolver authorizationResolver, GQLFilterParser gQLFilterParser, IHttpContextAccessor httpContextAccessor, - RuntimeConfigProvider runtimeConfigProvider) + RuntimeConfigProvider runtimeConfigProvider, + IEmbeddingService? embeddingService = null) { _queryManagerFactory = queryManagerFactory; _sqlMetadataProviderFactory = sqlMetadataProviderFactory; @@ -69,6 +72,7 @@ public SqlMutationEngine( _httpContextAccessor = httpContextAccessor; _gQLFilterParser = gQLFilterParser; _runtimeConfigProvider = runtimeConfigProvider; + _embeddingService = embeddingService; } /// @@ -347,6 +351,18 @@ private static bool IsPointMutation(IMiddlewareContext context) ISqlMetadataProvider sqlMetadataProvider = _sqlMetadataProviderFactory.GetMetadataProvider(dataSourceName); IQueryBuilder queryBuilder = _queryManagerFactory.GetQueryBuilder(sqlMetadataProvider.GetDatabaseType()); IQueryExecutor queryExecutor = _queryManagerFactory.GetQueryExecutor(sqlMetadataProvider.GetDatabaseType()); + + // Phase 3: substitute embed:true parameters with embedding vectors (REST mutation path) + if (_embeddingService is not null) + { + Entity entity = _runtimeConfigProvider.GetConfig().Entities[context.EntityName]; + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + context.ResolvedParameters, + entity.Source.Parameters, + _embeddingService, + _httpContextAccessor.HttpContext?.RequestAborted ?? CancellationToken.None); + } + SqlExecuteStructure executeQueryStructure = new( context.EntityName, sqlMetadataProvider, diff --git a/src/Core/Resolvers/SqlQueryEngine.cs b/src/Core/Resolvers/SqlQueryEngine.cs index f567251771..5785198649 100644 --- a/src/Core/Resolvers/SqlQueryEngine.cs +++ b/src/Core/Resolvers/SqlQueryEngine.cs @@ -12,6 +12,7 @@ using Azure.DataApiBuilder.Core.Services; using Azure.DataApiBuilder.Core.Services.Cache; using Azure.DataApiBuilder.Core.Services.MetadataProviders; +using Azure.DataApiBuilder.Core.Services.Embeddings; using Azure.DataApiBuilder.Service.Exceptions; using Azure.DataApiBuilder.Service.GraphQLBuilder; using Azure.DataApiBuilder.Service.GraphQLBuilder.Queries; @@ -37,6 +38,7 @@ public class SqlQueryEngine : IQueryEngine private readonly RuntimeConfigProvider _runtimeConfigProvider; private readonly GQLFilterParser _gQLFilterParser; private readonly DabCacheService _cache; + private readonly IEmbeddingService? _embeddingService; // // Constructor. @@ -49,7 +51,8 @@ public SqlQueryEngine( GQLFilterParser gQLFilterParser, ILogger logger, RuntimeConfigProvider runtimeConfigProvider, - DabCacheService cache) + DabCacheService cache, + IEmbeddingService? embeddingService = null) { _queryFactory = queryFactory; _sqlMetadataProviderFactory = sqlMetadataProviderFactory; @@ -59,6 +62,7 @@ public SqlQueryEngine( _logger = logger; _runtimeConfigProvider = runtimeConfigProvider; _cache = cache; + _embeddingService = embeddingService; } /// @@ -140,6 +144,17 @@ await ExecuteAsync(structure, dataSourceName, isMultipleCreateOperation: true), ISqlMetadataProvider sqlMetadataProvider = _sqlMetadataProviderFactory.GetMetadataProvider(dataSourceName); if (sqlMetadataProvider.GraphQLStoredProcedureExposedNameToEntityNameMap.TryGetValue(context.Selection.Field.Name, out string? entityName)) { + // Phase 3: substitute embed:true parameters with embedding vectors (GraphQL path) + if (_embeddingService is not null) + { + Entity entity = _runtimeConfigProvider.GetConfig().Entities[entityName]; + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + parameters, + entity.Source.Parameters, + _embeddingService, + _httpContextAccessor.HttpContext?.RequestAborted ?? CancellationToken.None); + } + SqlExecuteStructure sqlExecuteStructure = new( entityName, sqlMetadataProvider, @@ -198,6 +213,19 @@ await ExecuteAsync(structure, dataSourceName, isMultipleCreateOperation: true), /// public async Task ExecuteAsync(StoredProcedureRequestContext context, string dataSourceName) { + // Phase 3: substitute embed:true parameters with embedding vectors + // before SqlExecuteStructure reads them. The helper replaces text values + // (e.g., "wireless headphones") with vector JSON strings (e.g., "[0.012,...]"). + if (_embeddingService is not null) + { + Entity entity = _runtimeConfigProvider.GetConfig().Entities[context.EntityName]; + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + context.ResolvedParameters, + entity.Source.Parameters, + _embeddingService, + _httpContextAccessor.HttpContext?.RequestAborted ?? CancellationToken.None); + } + SqlExecuteStructure structure = new( context.EntityName, _sqlMetadataProviderFactory.GetMetadataProvider(dataSourceName), diff --git a/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs b/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs new file mode 100644 index 0000000000..309b8cf596 --- /dev/null +++ b/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs @@ -0,0 +1,110 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Globalization; +using System.Net; +using Azure.DataApiBuilder.Config.ObjectModel; +using Azure.DataApiBuilder.Service.Exceptions; + +namespace Azure.DataApiBuilder.Core.Services.Embeddings; + +/// +/// Substitutes text values with embedding vectors for stored procedure parameters marked with embed:true. +/// +/// Called before SqlExecuteStructure construction — the substituted values flow through +/// the normal String type conversion path (thanks to the metadata type override that +/// changes VECTOR params from Byte[] to String during startup). +/// +/// Example: +/// Input: resolvedParams["query_vector"] = "wireless headphones" (user's text) +/// Output: resolvedParams["query_vector"] = "[0.012,-0.045,...,0.083]" (vector JSON string) +/// +public static class ParameterEmbeddingHelper +{ + /// + /// For each parameter marked embed:true in config, replaces the text value in + /// resolvedParams with a serialized vector string. + /// + /// The embedding call goes through EmbeddingService which has built-in FusionCache, + /// so repeated identical texts return instantly without calling the AI provider. + /// + /// + /// The parameter dictionary from the request (REST body/query string or GraphQL args). + /// Modified in-place: text values for embed params are replaced with vector JSON strings. + /// + /// + /// Parameter metadata from dab-config.json for this entity. + /// Contains the embed:true flag for each parameter. + /// + /// The embedding service to call for text → vector conversion. + /// Cancellation token from the HTTP request. + public static async Task SubstituteEmbedParametersAsync( + IDictionary resolvedParams, + List? configParams, + IEmbeddingService embeddingService, + CancellationToken cancellationToken) + { + // Nothing to do if no config params defined + if (configParams is null) + { + return; + } + + foreach (ParameterMetadata configParam in configParams) + { + // Skip non-embed params — they pass through unchanged + // Example: "top_k" has Embed=false → skip + if (!configParam.Embed) + { + continue; + } + + // Check if the request provided this parameter + // If not provided, DAB's existing required-param validation will handle it + if (!resolvedParams.TryGetValue(configParam.Name, out object? value)) + { + continue; + } + + // Validate: embed params must have non-empty text + // Example FAIL: { "query_vector": "" } → 400 error + // Example FAIL: { "query_vector": " " } → 400 error + string? text = value?.ToString(); + if (string.IsNullOrWhiteSpace(text)) + { + throw new DataApiBuilderException( + message: $"Parameter '{configParam.Name}' has 'embed: true' but the provided text is empty or whitespace.", + statusCode: HttpStatusCode.BadRequest, + subStatusCode: DataApiBuilderException.SubStatusCodes.BadRequest); + } + + // Call EmbeddingService to convert text → float[1536] + // The service has built-in FusionCache (L1 + optional L2 Redis): + // First call for "wireless headphones" → calls Azure OpenAI API (~200-500ms) + // Second call for same text → cache hit, returns instantly + EmbeddingResult result = await embeddingService.TryEmbedAsync(text, cancellationToken); + + if (!result.Success || result.Embedding is null) + { + throw new DataApiBuilderException( + message: $"Failed to generate embedding for parameter '{configParam.Name}'.", + statusCode: HttpStatusCode.InternalServerError, + subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); + } + + // Serialize float[] to JSON string that Azure SQL accepts as VECTOR + // Example: float[] { 0.012f, -0.045f, 0.083f } → "[0.012,-0.045,0.083]" + // + // Uses InvariantCulture to prevent European locales from using comma as decimal separator + // (e.g., German locale would produce "0,012" instead of "0.012" → invalid JSON) + string vectorJson = "[" + + string.Join(",", result.Embedding.Select(f => f.ToString("G", CultureInfo.InvariantCulture))) + + "]"; + + // Replace the text value with the vector string in-place + // SqlExecuteStructure will see this as a String value (thanks to metadata override) + // and pass it through to SQL, which auto-casts to VECTOR(N) + resolvedParams[configParam.Name] = vectorJson; + } + } +} diff --git a/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs b/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs index cdb54a2ac2..12064dab48 100644 --- a/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs +++ b/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs @@ -223,6 +223,27 @@ protected override async Task FillSchemaForStoredProcedureAsync( parameterDefinition.Default = paramMetadata.Default; parameterDefinition.HasConfigDefault = paramMetadata.Default is not null; parameterDefinition.ConfigDefaultValue = paramMetadata.Default?.ToString(); + + // For embed:true parameters, override the SQL metadata type. + // SQL Server reports VECTOR(N) as "varbinary" → DAB maps to Byte[] → DbType.Binary. + // This causes ParseParamAsSystemType to try base64 decoding on our vector JSON string. + // + // Fix: override to String/DbType.String so the vector JSON string + // flows through ParseParamAsSystemType's "String" => param path (returns as-is). + // SQL Server auto-casts the NVARCHAR string to VECTOR at execution time. + // + // Guard conditions (ALL must be true): + // 1. This parameter has embed:true in config + // 2. SQL metadata reported it as Byte[] (the VECTOR→varbinary mapping) + // A normal varbinary param (e.g., image blob) is NOT affected — it won't have embed:true. + // + // Follows the same pattern as the DateTime DbType override above (lines 186-195). + if (paramMetadata.Embed && parameterDefinition.SystemType == typeof(byte[])) + { + parameterDefinition.SystemType = typeof(string); + parameterDefinition.DbType = System.Data.DbType.String; + parameterDefinition.SqlDbType = System.Data.SqlDbType.NVarChar; + } } } } From 74aadc76dde457e3d64efe434df25c7af9ecd365 Mon Sep 17 00:00:00 2001 From: Pratik Shrivastav Date: Wed, 29 Apr 2026 16:01:56 -0700 Subject: [PATCH 03/11] Phase 3 Stage 3.5: Address review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements 4 fixes from independent code reviews (rubber-duck + agent-skills:code-reviewer): Fix 1 (Float precision): Changed ToString format from 'G' (G7 default for Single, ~30% precision loss) to 'R' (round-trippable). Embeddings are precision-sensitive for cosine similarity. Fix 2 (Non-MSSQL rejection): Added Rule 0 in ValidateEmbedParameters to reject embed:true on non-MSSQL data sources. The metadata type override only exists in MsSqlMetadataProvider, so PostgreSQL/MySQL/Cosmos would fail at runtime with confusing errors. Now caught at startup with clear message. Fix 3 (Non-string input validation): Added explicit type check before embedding. Handles both System.String and System.Text.Json.JsonElement (DAB wraps body values in JsonElement). Rejects Number, Boolean, Array, Object with 400. Azure OpenAI's embedding API only accepts strings — being permissive at DAB level would silently embed garbage like 'System.Object[]' for arrays. Fix 4 (Hot-reload null service hard-fail): Removed silent skip when _embeddingService is null but embed params exist. Helper now checks upfront for any embed params, then validates service is available — throws 503 if not. Prevents data integrity risk during hot-reload edge cases where embeddings config gets disabled while DAB is running. Verified: 11 tests (3 positive + 7 negative + 1 GraphQL + 1 non-embed regression). All pass. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Configurations/RuntimeConfigValidator.cs | 14 ++++ src/Core/Resolvers/SqlMutationEngine.cs | 16 ++-- src/Core/Resolvers/SqlQueryEngine.cs | 32 ++++--- .../Embeddings/ParameterEmbeddingHelper.cs | 83 ++++++++++++++++++- 4 files changed, 115 insertions(+), 30 deletions(-) diff --git a/src/Core/Configurations/RuntimeConfigValidator.cs b/src/Core/Configurations/RuntimeConfigValidator.cs index dff4477b86..bf14e778bd 100644 --- a/src/Core/Configurations/RuntimeConfigValidator.cs +++ b/src/Core/Configurations/RuntimeConfigValidator.cs @@ -460,6 +460,20 @@ private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) continue; } + // Rule 0: embed:true is only supported on Azure SQL / SQL Server data sources. + // The metadata type override (Byte[] → String) only exists in MsSqlMetadataProvider. + // For PostgreSQL/MySQL/Cosmos, the request would fail at runtime with a confusing + // type error. Reject at startup instead. + // Example FAIL: PostgreSQL entity with embed:true → "embed feature only supported for MSSQL" + DataSource entityDataSource = runtimeConfig.GetDataSourceFromEntityName(entityName); + if (entityDataSource.DatabaseType != DatabaseType.MSSQL) + { + HandleOrRecordException(new DataApiBuilderException( + message: $"Entity '{entityName}': parameter '{param.Name}' has 'embed: true' but the data source type is '{entityDataSource.DatabaseType}'. The embed feature is currently only supported for Azure SQL / SQL Server.", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); + } + // Rule 1: embed:true is only valid on stored-procedure entities. // Tables/views don't have user-supplied parameters that get passed to SQL. // Example PASS: "SearchProducts": { "source": { "type": "stored-procedure" } } diff --git a/src/Core/Resolvers/SqlMutationEngine.cs b/src/Core/Resolvers/SqlMutationEngine.cs index 73d78997c0..75e43709b3 100644 --- a/src/Core/Resolvers/SqlMutationEngine.cs +++ b/src/Core/Resolvers/SqlMutationEngine.cs @@ -353,15 +353,13 @@ private static bool IsPointMutation(IMiddlewareContext context) IQueryExecutor queryExecutor = _queryManagerFactory.GetQueryExecutor(sqlMetadataProvider.GetDatabaseType()); // Phase 3: substitute embed:true parameters with embedding vectors (REST mutation path) - if (_embeddingService is not null) - { - Entity entity = _runtimeConfigProvider.GetConfig().Entities[context.EntityName]; - await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( - context.ResolvedParameters, - entity.Source.Parameters, - _embeddingService, - _httpContextAccessor.HttpContext?.RequestAborted ?? CancellationToken.None); - } + // Helper handles the case where embed params exist but service is null (throws 503). + Entity entity = _runtimeConfigProvider.GetConfig().Entities[context.EntityName]; + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + context.ResolvedParameters, + entity.Source.Parameters, + _embeddingService, + _httpContextAccessor.HttpContext?.RequestAborted ?? CancellationToken.None); SqlExecuteStructure executeQueryStructure = new( context.EntityName, diff --git a/src/Core/Resolvers/SqlQueryEngine.cs b/src/Core/Resolvers/SqlQueryEngine.cs index 5785198649..9c98e3465a 100644 --- a/src/Core/Resolvers/SqlQueryEngine.cs +++ b/src/Core/Resolvers/SqlQueryEngine.cs @@ -145,15 +145,13 @@ await ExecuteAsync(structure, dataSourceName, isMultipleCreateOperation: true), if (sqlMetadataProvider.GraphQLStoredProcedureExposedNameToEntityNameMap.TryGetValue(context.Selection.Field.Name, out string? entityName)) { // Phase 3: substitute embed:true parameters with embedding vectors (GraphQL path) - if (_embeddingService is not null) - { - Entity entity = _runtimeConfigProvider.GetConfig().Entities[entityName]; - await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( - parameters, - entity.Source.Parameters, - _embeddingService, - _httpContextAccessor.HttpContext?.RequestAborted ?? CancellationToken.None); - } + // Helper handles the case where embed params exist but service is null (throws 503). + Entity entity = _runtimeConfigProvider.GetConfig().Entities[entityName]; + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + parameters, + entity.Source.Parameters, + _embeddingService, + _httpContextAccessor.HttpContext?.RequestAborted ?? CancellationToken.None); SqlExecuteStructure sqlExecuteStructure = new( entityName, @@ -216,15 +214,13 @@ public async Task ExecuteAsync(StoredProcedureRequestContext cont // Phase 3: substitute embed:true parameters with embedding vectors // before SqlExecuteStructure reads them. The helper replaces text values // (e.g., "wireless headphones") with vector JSON strings (e.g., "[0.012,...]"). - if (_embeddingService is not null) - { - Entity entity = _runtimeConfigProvider.GetConfig().Entities[context.EntityName]; - await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( - context.ResolvedParameters, - entity.Source.Parameters, - _embeddingService, - _httpContextAccessor.HttpContext?.RequestAborted ?? CancellationToken.None); - } + // Helper handles the case where embed params exist but service is null (throws 503). + Entity entity = _runtimeConfigProvider.GetConfig().Entities[context.EntityName]; + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + context.ResolvedParameters, + entity.Source.Parameters, + _embeddingService, + _httpContextAccessor.HttpContext?.RequestAborted ?? CancellationToken.None); SqlExecuteStructure structure = new( context.EntityName, diff --git a/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs b/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs index 309b8cf596..a67630fc4d 100644 --- a/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs +++ b/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs @@ -41,7 +41,7 @@ public static class ParameterEmbeddingHelper public static async Task SubstituteEmbedParametersAsync( IDictionary resolvedParams, List? configParams, - IEmbeddingService embeddingService, + IEmbeddingService? embeddingService, CancellationToken cancellationToken) { // Nothing to do if no config params defined @@ -50,6 +50,29 @@ public static async Task SubstituteEmbedParametersAsync( return; } + // Check upfront: are there any embed params we need to handle? + // If not, return immediately — no need to validate the embedding service. + bool hasEmbedParams = configParams.Any(p => p.Embed); + if (!hasEmbedParams) + { + return; + } + + // If we have embed params but no embedding service, fail loudly. + // This catches edge cases like: + // - Hot-reload disabled embeddings while DAB is running + // - DI misconfiguration (service not registered when embed params exist) + // - Future code paths that construct engines without the service + // Without this check, the silent-skip behavior would send raw text to SQL, + // producing confusing errors or (worst case) silently storing/querying with garbage. + if (embeddingService is null) + { + throw new DataApiBuilderException( + message: "An embed parameter is configured but the embedding service is not available. Verify runtime.embeddings is configured and enabled.", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); + } + foreach (ParameterMetadata configParam in configParams) { // Skip non-embed params — they pass through unchanged @@ -66,10 +89,59 @@ public static async Task SubstituteEmbedParametersAsync( continue; } + // Validate: embed parameters must be strings. + // Azure OpenAI's embedding API only accepts strings as input — passing a number, + // boolean, array, or object would either be silently stringified into garbage + // (e.g., "System.Object[]") or rejected by the API with a confusing error. + // + // DAB delivers parameter values as either System.String OR System.Text.Json.JsonElement + // (the JSON parser wraps body values in JsonElement). We accept both string and + // JsonElement-of-kind-String, and reject all other types with a clear 400. + // + // Example FAIL: { "query_vector": 12345 } → JsonElement(Number) → 400 "must be a string" + // Example FAIL: { "query_vector": true } → JsonElement(True) → 400 "must be a string" + // Example FAIL: { "query_vector": ["a","b"] } → JsonElement(Array) → 400 "must be a string" + // Example FAIL: { "query_vector": {"foo":"bar"} } → JsonElement(Object) → 400 "must be a string" + // Example PASS: { "query_vector": "headphones" } → JsonElement(String) or string → proceed + // + // Note: GraphQL is automatically protected since the embed param is exposed as + // GraphQL String type (via Stage 2 metadata override) — non-strings are rejected + // by the GraphQL parser before reaching this code. + string? text = null; + if (value is string s) + { + text = s; + } + else if (value is System.Text.Json.JsonElement jsonElement) + { + if (jsonElement.ValueKind == System.Text.Json.JsonValueKind.String) + { + text = jsonElement.GetString(); + } + else if (jsonElement.ValueKind == System.Text.Json.JsonValueKind.Null) + { + text = null; // Will be caught by IsNullOrWhiteSpace below + } + else + { + throw new DataApiBuilderException( + message: $"Parameter '{configParam.Name}' has 'embed: true' but received a non-string JSON value of kind '{jsonElement.ValueKind}'. Embed parameters must be JSON strings.", + statusCode: HttpStatusCode.BadRequest, + subStatusCode: DataApiBuilderException.SubStatusCodes.BadRequest); + } + } + else if (value is not null) + { + throw new DataApiBuilderException( + message: $"Parameter '{configParam.Name}' has 'embed: true' but received a non-string value of type '{value.GetType().Name}'. Embed parameters must be JSON strings.", + statusCode: HttpStatusCode.BadRequest, + subStatusCode: DataApiBuilderException.SubStatusCodes.BadRequest); + } + // Validate: embed params must have non-empty text // Example FAIL: { "query_vector": "" } → 400 error // Example FAIL: { "query_vector": " " } → 400 error - string? text = value?.ToString(); + // Example FAIL: { "query_vector": null } → 400 error (null treated as empty) if (string.IsNullOrWhiteSpace(text)) { throw new DataApiBuilderException( @@ -97,8 +169,13 @@ public static async Task SubstituteEmbedParametersAsync( // // Uses InvariantCulture to prevent European locales from using comma as decimal separator // (e.g., German locale would produce "0,012" instead of "0.012" → invalid JSON) + // + // Uses "R" (round-trip) format specifier instead of "G" (general): + // - "G" defaults to G7 for Single, which is NOT round-trippable (~30% of values lose precision) + // - "R" guarantees the string can be parsed back to the exact same float + // - Embeddings are precision-sensitive — even tiny drift affects cosine similarity scores string vectorJson = "[" - + string.Join(",", result.Embedding.Select(f => f.ToString("G", CultureInfo.InvariantCulture))) + + string.Join(",", result.Embedding.Select(f => f.ToString("R", CultureInfo.InvariantCulture))) + "]"; // Replace the text value with the vector string in-place From 43914770767892994168fe27cfa4447b61f66ec5 Mon Sep 17 00:00:00 2001 From: Pratik Shrivastav Date: Wed, 29 Apr 2026 16:59:56 -0700 Subject: [PATCH 04/11] Phase 3 Stage 3.6: Reject non-VECTOR types for embed:true at startup Addresses reviewer concern about silent failure when embed:true is misconfigured on a non-VECTOR parameter. Previously, embed:true on an NVARCHAR parameter would pass validation, bypass the metadata override (since it is not Byte[]), and silently produce empty/wrong results at request time. In MsSqlMetadataProvider.FillSchemaForStoredProcedureAsync: - Restructured the embed:true block to validate type FIRST, then override - If embed:true is configured but the SystemType is not Byte[] (i.e., not a VECTOR-shaped param), throw at startup with clear error message - Catches: NVARCHAR, INT, DATETIME, and other non-VECTOR types Documentation updates: - ParameterMetadata.cs: XML doc warns target sproc param must be VECTOR(N) - dab.draft.schema.json: schema description includes the requirement - MsSqlMetadataProvider.cs: comment explains the check rationale and the remaining VARBINARY edge case (loud SQL error at request time) Verified with real misconfiguration: created test sproc with NVARCHAR(MAX) parameter, marked embed:true in config, DAB fails to start with clear error identifying the procedure, parameter, and the actual type vs expected. Known remaining limitation: VECTOR(N) and varbinary(N) are indistinguishable in INFORMATION_SCHEMA.PARAMETERS. Real varbinary blob misuse would still pass this check but fail at SQL execution with implicit conversion error - loud and clear, so accepted. Documented; sys.parameters-based detection filed as future enhancement. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- schemas/dab.draft.schema.json | 2 +- src/Config/ObjectModel/ParameterMetadata.cs | 7 +++ .../MsSqlMetadataProvider.cs | 54 ++++++++++++++----- 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/schemas/dab.draft.schema.json b/schemas/dab.draft.schema.json index 4e60103a0c..c4b9ce9c89 100644 --- a/schemas/dab.draft.schema.json +++ b/schemas/dab.draft.schema.json @@ -1246,7 +1246,7 @@ "required": { "$ref": "#/$defs/boolean-or-string", "description": "Is parameter required" }, "default": { "type": ["string", "number", "boolean", "null"], "description": "Default value" }, "description": { "type": "string", "description": "Parameter description. Since descriptions for multiple parameters are provided as a comma-separated string, individual parameter descriptions must not contain a comma (',')." }, - "embed": { "type": "boolean", "description": "When true, the parameter text is automatically converted to an embedding vector via the configured embedding service before being passed to the stored procedure. Requires runtime.embeddings to be configured. Only valid on stored-procedure entities.", "default": false } + "embed": { "type": "boolean", "description": "When true, the parameter text is automatically converted to an embedding vector via the configured embedding service before being passed to the stored procedure. Requires runtime.embeddings to be configured. Only valid on stored-procedure entities. The target stored procedure parameter must be declared as VECTOR(N) — DAB cannot detect non-VECTOR misconfigurations at startup due to SQL Server metadata limitations.", "default": false } } } } diff --git a/src/Config/ObjectModel/ParameterMetadata.cs b/src/Config/ObjectModel/ParameterMetadata.cs index e087bf7819..f0107e087a 100644 --- a/src/Config/ObjectModel/ParameterMetadata.cs +++ b/src/Config/ObjectModel/ParameterMetadata.cs @@ -29,6 +29,13 @@ public class ParameterMetadata /// When true, the parameter value (text) is automatically embedded via the /// EmbeddingService and the resulting vector is passed to the stored procedure. /// Only valid on stored-procedure entities when runtime.embeddings is configured. + /// + /// IMPORTANT: The target stored procedure parameter must be declared as VECTOR(N). + /// SQL Server's metadata system reports VECTOR(N) and varbinary indistinguishably, + /// so DAB cannot detect this misconfiguration at startup. If embed:true is applied + /// to a non-VECTOR parameter (e.g., NVARCHAR or VARBINARY), the request will fail + /// at runtime with a SQL error or return semantically incorrect results. + /// It is the developer's responsibility to ensure the sproc parameter is VECTOR(N). /// public bool Embed { get; set; } } diff --git a/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs b/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs index 12064dab48..7e4558e7ae 100644 --- a/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs +++ b/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs @@ -224,22 +224,52 @@ protected override async Task FillSchemaForStoredProcedureAsync( parameterDefinition.HasConfigDefault = paramMetadata.Default is not null; parameterDefinition.ConfigDefaultValue = paramMetadata.Default?.ToString(); - // For embed:true parameters, override the SQL metadata type. - // SQL Server reports VECTOR(N) as "varbinary" → DAB maps to Byte[] → DbType.Binary. - // This causes ParseParamAsSystemType to try base64 decoding on our vector JSON string. + // Phase 3: validate and override types for embed:true parameters. // - // Fix: override to String/DbType.String so the vector JSON string - // flows through ParseParamAsSystemType's "String" => param path (returns as-is). - // SQL Server auto-casts the NVARCHAR string to VECTOR at execution time. + // SQL Server reports VECTOR(N) as "varbinary" via INFORMATION_SCHEMA.PARAMETERS, + // which DAB maps to typeof(byte[]). We use this Byte[] mapping as our signal + // that the parameter is VECTOR-shaped. If the param is NOT Byte[], the + // developer has misconfigured — embed only makes sense for VECTOR parameters. // - // Guard conditions (ALL must be true): - // 1. This parameter has embed:true in config - // 2. SQL metadata reported it as Byte[] (the VECTOR→varbinary mapping) - // A normal varbinary param (e.g., image blob) is NOT affected — it won't have embed:true. + // Why fail at startup instead of letting it silently break at request time: + // - embed:true on NVARCHAR → vector JSON treated as text → empty/wrong results, no error + // - embed:true on INT/DATETIME → SQL conversion error at request time + // - embed:true on VECTOR(N) → works correctly (SQL reports as Byte[]) + // Catching this at startup gives a clear error message before any user is impacted. // - // Follows the same pattern as the DateTime DbType override above (lines 186-195). - if (paramMetadata.Embed && parameterDefinition.SystemType == typeof(byte[])) + // KNOWN LIMITATION: SQL Server's INFORMATION_SCHEMA.PARAMETERS reports both + // VECTOR(N) and varbinary(N) as the same "varbinary" type. We cannot + // distinguish them here. If a developer mistakenly applies embed:true to + // a real varbinary parameter (e.g., image blob), this override will fire + // and the request will fail at SQL execution with an "implicit conversion + // not allowed" error. That failure is loud and clear, so we accept it. + // Documented in ParameterMetadata.Embed and the JSON schema. + // + // (Detection IS possible via sys.parameters which distinguishes vector + // from varbinary by user_type_id, but adding a separate query for this + // would require infrastructure changes beyond Phase 3 scope. Filed as + // future enhancement.) + // + // Follows the same pattern as the DateTime DbType override above. + if (paramMetadata.Embed) { + // Reject embed:true on non-VECTOR-shaped params at startup with a + // clear error. This catches NVARCHAR/INT/DATETIME misconfigurations + // that would otherwise silently produce wrong results at request time. + if (parameterDefinition.SystemType != typeof(byte[])) + { + throw new DataApiBuilderException( + message: $"Procedure '{schemaName}.{storedProcedureSourceName}': " + + $"parameter '{configParamKey}' has 'embed: true' in config but is " + + $"declared as '{parameterDefinition.SystemType.Name}' in the stored procedure. " + + $"Embed parameters must be declared as VECTOR(N).", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.ErrorInInitialization); + } + + // Override the metadata type so the vector JSON string flows through + // DAB's String type pipeline. SQL Server auto-casts NVARCHAR → VECTOR + // at execution time. parameterDefinition.SystemType = typeof(string); parameterDefinition.DbType = System.Data.DbType.String; parameterDefinition.SqlDbType = System.Data.SqlDbType.NVarChar; From f84e143d173ace412df213df22833fc15e7b7c9a Mon Sep 17 00:00:00 2001 From: Pratik Shrivastav Date: Wed, 29 Apr 2026 17:05:04 -0700 Subject: [PATCH 05/11] Phase 3 Stage 3.7: Round-2 reviewer polish fixes Addresses Important issues from second-round code review: Fix 1 (G9 vs R for float precision): - Microsoft docs: "For Single values, the R format specifier in some cases fails to successfully round-trip the original value. We recommend that you use the G9 format specifier instead." - Changed ToString("R") to ToString("G9") for guaranteed round-trip - ParameterEmbeddingHelper.cs Fix 2 (Scope 503 to per-request): - Previously: helper threw 503 if entity has any embed param + service is null - Problem: requests that omit optional embed params shouldn't fail just because the embedding service is unavailable - Now: 503 only thrown when this specific request supplies a value for an embed param AND service is null - Removed misleading hot-reload comment (engines hold cached service ref) - ParameterEmbeddingHelper.cs Fix 3 (Validator continue after each rule): - Previously: a single misconfigured embed param could record up to 4 errors in HandleOrRecord-record mode (CLI validate) - Now: each rule fires continue after recording, preventing redundant noise - Last rule (Default) doesn't need continue - RuntimeConfigValidator.cs Fix 4 (Hoist data source lookup): - Previously: GetDataSourceFromEntityName called per-parameter inside loop - Now: called once per entity outside the parameter loop - Cheap fix; obvious code smell removed - RuntimeConfigValidator.cs Verified: all positive and negative tests still pass after these changes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Configurations/RuntimeConfigValidator.cs | 9 +++- .../Embeddings/ParameterEmbeddingHelper.cs | 47 +++++++++++-------- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/src/Core/Configurations/RuntimeConfigValidator.cs b/src/Core/Configurations/RuntimeConfigValidator.cs index bf14e778bd..a938b12d12 100644 --- a/src/Core/Configurations/RuntimeConfigValidator.cs +++ b/src/Core/Configurations/RuntimeConfigValidator.cs @@ -448,6 +448,10 @@ private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) continue; } + // Hoist data source lookup outside the param loop — it's entity-scoped, not param-scoped. + // Looked up once per entity instead of once per parameter (was duplicated work in Stage 3.5). + DataSource entityDataSource = runtimeConfig.GetDataSourceFromEntityName(entityName); + // Check each parameter for the embed flag. // Example: iterates over { "name": "query_vector", "embed": true } and { "name": "top_k", "default": "5" } foreach (ParameterMetadata param in entity.Source.Parameters) @@ -465,13 +469,13 @@ private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) // For PostgreSQL/MySQL/Cosmos, the request would fail at runtime with a confusing // type error. Reject at startup instead. // Example FAIL: PostgreSQL entity with embed:true → "embed feature only supported for MSSQL" - DataSource entityDataSource = runtimeConfig.GetDataSourceFromEntityName(entityName); if (entityDataSource.DatabaseType != DatabaseType.MSSQL) { HandleOrRecordException(new DataApiBuilderException( message: $"Entity '{entityName}': parameter '{param.Name}' has 'embed: true' but the data source type is '{entityDataSource.DatabaseType}'. The embed feature is currently only supported for Azure SQL / SQL Server.", statusCode: HttpStatusCode.ServiceUnavailable, subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); + continue; // Don't cascade further rules — this is a fundamental issue } // Rule 1: embed:true is only valid on stored-procedure entities. @@ -485,6 +489,7 @@ private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) message: $"Entity '{entityName}': parameter '{param.Name}' has 'embed: true' but is only valid on stored-procedure entities.", statusCode: HttpStatusCode.ServiceUnavailable, subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); + continue; // Don't cascade further rules } // Rule 2: embed:true requires runtime.embeddings to be configured and enabled. @@ -498,6 +503,7 @@ private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) message: $"Entity '{entityName}': parameter '{param.Name}' has 'embed: true' but runtime.embeddings is not configured or not enabled.", statusCode: HttpStatusCode.ServiceUnavailable, subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); + continue; // Don't cascade further rules } // Rule 3: embed:true with a default value is not supported. @@ -511,6 +517,7 @@ private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) message: $"Entity '{entityName}': parameter '{param.Name}' has both 'embed: true' and a 'default' value. Embed parameters cannot have default values.", statusCode: HttpStatusCode.ServiceUnavailable, subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); + // No continue needed — this is the last rule } } } diff --git a/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs b/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs index a67630fc4d..5da24073ba 100644 --- a/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs +++ b/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs @@ -50,29 +50,13 @@ public static async Task SubstituteEmbedParametersAsync( return; } - // Check upfront: are there any embed params we need to handle? - // If not, return immediately — no need to validate the embedding service. + // Quick exit: are there any embed params at all in config? bool hasEmbedParams = configParams.Any(p => p.Embed); if (!hasEmbedParams) { return; } - // If we have embed params but no embedding service, fail loudly. - // This catches edge cases like: - // - Hot-reload disabled embeddings while DAB is running - // - DI misconfiguration (service not registered when embed params exist) - // - Future code paths that construct engines without the service - // Without this check, the silent-skip behavior would send raw text to SQL, - // producing confusing errors or (worst case) silently storing/querying with garbage. - if (embeddingService is null) - { - throw new DataApiBuilderException( - message: "An embed parameter is configured but the embedding service is not available. Verify runtime.embeddings is configured and enabled.", - statusCode: HttpStatusCode.ServiceUnavailable, - subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); - } - foreach (ParameterMetadata configParam in configParams) { // Skip non-embed params — they pass through unchanged @@ -84,11 +68,32 @@ public static async Task SubstituteEmbedParametersAsync( // Check if the request provided this parameter // If not provided, DAB's existing required-param validation will handle it + // (and an embed:true param without a value in this request doesn't need + // the embedding service — only the params actually being substituted do) if (!resolvedParams.TryGetValue(configParam.Name, out object? value)) { continue; } + // If we have an embed param value to substitute but no embedding service, + // fail loudly. Without this check, the silent-skip behavior would send raw + // text to SQL, producing confusing errors or silently wrong results. + // + // This catches: + // - DI misconfiguration (service not registered when embed params exist) + // - Future code paths that construct engines without the service + // + // Note: This check is scoped to "this request actually has an embed param + // value" — a request that omits the embed param won't fail here, so optional + // embed params don't break when the embedding service is unavailable. + if (embeddingService is null) + { + throw new DataApiBuilderException( + message: $"Parameter '{configParam.Name}' has 'embed: true' but the embedding service is not available. Verify runtime.embeddings is configured and enabled.", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); + } + // Validate: embed parameters must be strings. // Azure OpenAI's embedding API only accepts strings as input — passing a number, // boolean, array, or object would either be silently stringified into garbage @@ -170,12 +175,14 @@ public static async Task SubstituteEmbedParametersAsync( // Uses InvariantCulture to prevent European locales from using comma as decimal separator // (e.g., German locale would produce "0,012" instead of "0.012" → invalid JSON) // - // Uses "R" (round-trip) format specifier instead of "G" (general): + // Uses "G9" format specifier (NOT "G" or "R"): // - "G" defaults to G7 for Single, which is NOT round-trippable (~30% of values lose precision) - // - "R" guarantees the string can be parsed back to the exact same float + // - "R" is documented to fail round-trip in some cases for Single + // (per Microsoft docs: "We recommend that you use the G9 format specifier instead.") + // - "G9" guarantees the string can be parsed back to the exact same float // - Embeddings are precision-sensitive — even tiny drift affects cosine similarity scores string vectorJson = "[" - + string.Join(",", result.Embedding.Select(f => f.ToString("R", CultureInfo.InvariantCulture))) + + string.Join(",", result.Embedding.Select(f => f.ToString("G9", CultureInfo.InvariantCulture))) + "]"; // Replace the text value with the vector string in-place From c101a5d396cb4268f13aea80494429cc577a1abe Mon Sep 17 00:00:00 2001 From: Pratik Shrivastav Date: Thu, 30 Apr 2026 10:00:13 -0700 Subject: [PATCH 06/11] Phase 3 Stage 3.8: Revert two over-corrections from Stage 3.7 Reverting two changes from Stage 3.7 that, on reflection, made the code worse: Revert 1: Removed the continue statements after each validator rule. - Why reverted: DAB's other validators in RuntimeConfigValidator.cs do NOT use this pattern. They HandleOrRecordException and continue checking all rules. Adding continue here was inconsistent with the codebase. - The reviewer's "noisy errors" concern was an aesthetic preference, not a correctness issue. Collect-all-errors is the established DAB pattern and better UX for users (fix all problems in one pass vs iterate). Revert 2: Moved the embeddingService null check back to the top of the helper. - Why reverted: The per-request scoped check made the failure mode unpredictable. Same DAB instance might fail with 503 sometimes (when embed param supplied) and succeed other times (when omitted). Hard to debug. - The reviewer's "over-broad 503" concern was theoretical. In practice, if embed:true is in your config, the embedding service should be available. Silently working when the service is missing creates a half-broken state that limps along instead of failing clearly. - Original Stage 3.5 behavior (fail upfront if any embed params + no service) is simpler, more predictable, and matches the principle that misconfiguration should fail loud and fast. Kept from Stage 3.7: - G9 vs R for float precision (correct fix, Microsoft-documented) - Hoisted GetDataSourceFromEntityName outside parameter loop (cheap, correct) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Configurations/RuntimeConfigValidator.cs | 4 -- .../Embeddings/ParameterEmbeddingHelper.cs | 37 ++++++++----------- 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/src/Core/Configurations/RuntimeConfigValidator.cs b/src/Core/Configurations/RuntimeConfigValidator.cs index a938b12d12..c88bd15e40 100644 --- a/src/Core/Configurations/RuntimeConfigValidator.cs +++ b/src/Core/Configurations/RuntimeConfigValidator.cs @@ -475,7 +475,6 @@ private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) message: $"Entity '{entityName}': parameter '{param.Name}' has 'embed: true' but the data source type is '{entityDataSource.DatabaseType}'. The embed feature is currently only supported for Azure SQL / SQL Server.", statusCode: HttpStatusCode.ServiceUnavailable, subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); - continue; // Don't cascade further rules — this is a fundamental issue } // Rule 1: embed:true is only valid on stored-procedure entities. @@ -489,7 +488,6 @@ private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) message: $"Entity '{entityName}': parameter '{param.Name}' has 'embed: true' but is only valid on stored-procedure entities.", statusCode: HttpStatusCode.ServiceUnavailable, subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); - continue; // Don't cascade further rules } // Rule 2: embed:true requires runtime.embeddings to be configured and enabled. @@ -503,7 +501,6 @@ private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) message: $"Entity '{entityName}': parameter '{param.Name}' has 'embed: true' but runtime.embeddings is not configured or not enabled.", statusCode: HttpStatusCode.ServiceUnavailable, subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); - continue; // Don't cascade further rules } // Rule 3: embed:true with a default value is not supported. @@ -517,7 +514,6 @@ private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) message: $"Entity '{entityName}': parameter '{param.Name}' has both 'embed: true' and a 'default' value. Embed parameters cannot have default values.", statusCode: HttpStatusCode.ServiceUnavailable, subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); - // No continue needed — this is the last rule } } } diff --git a/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs b/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs index 5da24073ba..f5f9d475c3 100644 --- a/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs +++ b/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs @@ -57,6 +57,22 @@ public static async Task SubstituteEmbedParametersAsync( return; } + // If we have embed params in config but no embedding service, fail loudly. + // This catches DI misconfiguration and future code paths that construct engines + // without the service. Without this check, the silent-skip behavior would send + // raw text to SQL, producing confusing errors or silently wrong results. + // + // Note: Startup config validation already requires runtime.embeddings to be + // configured and enabled when embed:true is present (see ValidateEmbedParameters). + // This is defense-in-depth for unexpected DI states at runtime. + if (embeddingService is null) + { + throw new DataApiBuilderException( + message: "An embed parameter is configured but the embedding service is not available. Verify runtime.embeddings is configured and enabled.", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); + } + foreach (ParameterMetadata configParam in configParams) { // Skip non-embed params — they pass through unchanged @@ -68,32 +84,11 @@ public static async Task SubstituteEmbedParametersAsync( // Check if the request provided this parameter // If not provided, DAB's existing required-param validation will handle it - // (and an embed:true param without a value in this request doesn't need - // the embedding service — only the params actually being substituted do) if (!resolvedParams.TryGetValue(configParam.Name, out object? value)) { continue; } - // If we have an embed param value to substitute but no embedding service, - // fail loudly. Without this check, the silent-skip behavior would send raw - // text to SQL, producing confusing errors or silently wrong results. - // - // This catches: - // - DI misconfiguration (service not registered when embed params exist) - // - Future code paths that construct engines without the service - // - // Note: This check is scoped to "this request actually has an embed param - // value" — a request that omits the embed param won't fail here, so optional - // embed params don't break when the embedding service is unavailable. - if (embeddingService is null) - { - throw new DataApiBuilderException( - message: $"Parameter '{configParam.Name}' has 'embed: true' but the embedding service is not available. Verify runtime.embeddings is configured and enabled.", - statusCode: HttpStatusCode.ServiceUnavailable, - subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); - } - // Validate: embed parameters must be strings. // Azure OpenAI's embedding API only accepts strings as input — passing a number, // boolean, array, or object would either be silently stringified into garbage From d32b9afda07d7de19c5a2a639b62455629165485 Mon Sep 17 00:00:00 2001 From: Pratik Shrivastav Date: Fri, 1 May 2026 15:03:04 -0700 Subject: [PATCH 07/11] Phase 3 Stage 3.9: Multi-embed batching + clarify embed/default rationale MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses two PR review comments from JimRoberts-MS on PR #1: Comment #2 (ParameterEmbeddingHelper.cs:157): "is there a limitation that only one parameter can be configured for embedding at a time? I think we'd want to do batching here to avoid multiple sequential waits on the api" Comment #1 (RuntimeConfigValidator.cs:507): "can this limitation about default values be clarified a bit more?" Changes: 1. ParameterEmbeddingHelper.cs — refactor to use TryEmbedBatchAsync (addresses #2): Restructured the substitution loop into 3 phases: - COLLECT — validate each embed param value, gather (paramName, text) pairs (preserves per-param error specificity for type/null checks) - BATCH — single TryEmbedBatchAsync call instead of N sequential TryEmbedAsync calls (saves ~(N-1) × API_LATENCY on cache miss) - SUBSTITUTE — write each returned vector back into resolvedParams Behavior preserved: - Single-embed-param path is equivalent (batch of 1) - All error status codes unchanged (400 for bad input, 500 for service failure, 503 for missing service) - In-place mutation contract on resolvedParams unchanged - G9 float format preserved Defensive checks added: - Length mismatch between requests and returned embeddings → 500 - Null/empty embedding for any individual param → 500 Type validation extracted into private ExtractTextValue(paramName, value) helper to flatten the nested if/else in the main loop. Verified with 16 manual test cases across REST POST/GET and GraphQL, covering single-embed and multi-embed sprocs, positive and negative inputs. 2. ParameterEmbeddingHelper.cs — fix misleading internal comment (related to #1's surrounding context): The previous comment claimed "DAB's existing required-param validation handles missing required params later" — but DAB's request validation for sprocs only checks for extra fields (not missing ones). The actual mechanism that catches missing required params is the SQL Server error "expects parameter X, which was not supplied", parsed by MsSqlDbExceptionParser into a 400 DatabaseInputError. Updated the comment to describe what actually happens. 3. RuntimeConfigValidator.cs — expand embed/default rule rationale (addresses #1): Replaced the one-line rationale for "Rule 3: embed:true with a default value is not supported" with a multi-paragraph explanation that: - Leads with the conceptual UX point: an embed param represents user input (typically a search query); defaulting it would mean the server fabricates and embeds a query the user never typed. That isn't a sensible fallback. - Notes that defaults on non-embed params of the same sproc remain supported (rule only fires for embed: true params). - Briefly documents why even setting aside the UX concern, supporting embed-defaults would be non-trivial (GraphQL schema literal-baking has no VECTOR type; REST/MCP defaults would be re-embedded every request; embedding-at-startup couples startup to provider availability). - Documents the current observed behavior when a client forgets to supply an embed param (verified empirically): explicit null/empty → 400 BadRequest from the helper; field omitted → 400 DatabaseInputError from SQL via MsSqlDbExceptionParser. Both produce a clear, actionable client error. - Notes that the rule can be lifted later if a real use case emerges. No behavior changes beyond the batching itself. Validator rule unchanged; helper logic for missing-value handling unchanged; error message text unchanged. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Configurations/RuntimeConfigValidator.cs | 41 +++- .../Embeddings/ParameterEmbeddingHelper.cs | 216 +++++++++++++----- 2 files changed, 193 insertions(+), 64 deletions(-) diff --git a/src/Core/Configurations/RuntimeConfigValidator.cs b/src/Core/Configurations/RuntimeConfigValidator.cs index c88bd15e40..97242a681f 100644 --- a/src/Core/Configurations/RuntimeConfigValidator.cs +++ b/src/Core/Configurations/RuntimeConfigValidator.cs @@ -504,7 +504,46 @@ private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) } // Rule 3: embed:true with a default value is not supported. - // A default value is text (e.g., "wireless headphones") that would need embedding at startup — not supported. + // + // An embed parameter represents the user's text input that gets converted + // to a vector at request time — typically a semantic-search query. + // + // Setting a default for an embed parameter would mean: if the client doesn't + // supply a search query, the server invents one (e.g., "wireless headphones"), + // embeds it, and runs a semantic search the user never asked for. That isn't + // a fallback — it's the server fabricating user input. In any real UX, a + // missing search query indicates a client bug or an empty search box, not an + // invitation for the server to substitute a canned query on the user's behalf. + // + // (Defaults on non-embed parameters of the same sproc are unaffected by this + // rule and continue to work as before.) + // + // Even setting aside the UX concern, supporting embed-defaults would be + // non-trivial: + // - GraphQL schema defaults are baked in at startup as typed literals + // (GraphQLStoredProcedureBuilder.ConvertValueToGraphQLType). There is no + // VECTOR literal type in GraphQL, and the literal text would surface in + // introspection as a misleading default value for an embedded parameter. + // - REST/MCP defaults are injected as plain text into the resolved-parameter + // dictionary, then would be re-embedded by ParameterEmbeddingHelper on + // every request — a hidden per-request cost for a value the client never + // sent. + // - Embedding the default once at startup would couple application startup + // to the embedding provider's network availability (validation runs in + // CLI / startup contexts that may not have outbound access). + // + // What happens today if a client forgets to supply an embed parameter: + // - {"query_vector": null} or "" → 400 BadRequest "has 'embed: true' but + // the provided text is empty or whitespace." (caught by ParameterEmbeddingHelper) + // - field omitted entirely → 400 DatabaseInputError "expects parameter + // '@query_vector', which was not supplied." (SQL Server error, parsed + // by MsSqlDbExceptionParser) + // Both produce a clear, actionable client error — no silent failure. + // + // If a real use case for embed-defaults ever emerges, this rule can be lifted + // with the matching runtime support added. For now, embed parameters should + // always be supplied by the client. + // // Example PASS: { "name": "query_vector", "embed": true } (no default) // Example FAIL: { "name": "query_vector", "embed": true, "default": "wireless headphones" } // → Error: "parameter 'query_vector' has both 'embed: true' and a 'default' value. Embed parameters cannot have default values." diff --git a/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs b/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs index f5f9d475c3..3ef394d81f 100644 --- a/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs +++ b/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs @@ -3,6 +3,7 @@ using System.Globalization; using System.Net; +using System.Text.Json; using Azure.DataApiBuilder.Config.ObjectModel; using Azure.DataApiBuilder.Service.Exceptions; @@ -25,6 +26,14 @@ public static class ParameterEmbeddingHelper /// For each parameter marked embed:true in config, replaces the text value in /// resolvedParams with a serialized vector string. /// + /// Implementation uses 3 phases for efficiency when a sproc has multiple embed params: + /// 1. COLLECT — validate all embed param values, build a list of texts to embed + /// 2. BATCH — single TryEmbedBatchAsync call instead of N sequential TryEmbedAsync calls + /// 3. SUBSTITUTE — serialize each returned vector and write back into resolvedParams + /// + /// For the common single-embed-param case, batch of 1 is equivalent to sequential. + /// For multi-embed sprocs, this saves ~(N-1) × API_LATENCY on cache miss. + /// /// The embedding call goes through EmbeddingService which has built-in FusionCache, /// so repeated identical texts return instantly without calling the AI provider. /// @@ -73,6 +82,18 @@ public static async Task SubstituteEmbedParametersAsync( subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); } + // ───────────────────────────────────────────────────────────────────── + // PHASE 1: COLLECT — validate each embed param's value and gather texts + // ───────────────────────────────────────────────────────────────────── + // We validate eagerly (per-param) so error messages stay specific: + // "Parameter 'foo' has embed:true but received a non-string value..." + // rather than the batch-level "embed failed for params X, Y, Z." + // + // After this phase: embedRequests has (paramName, text) for every embed + // param the user supplied. Param metadata defines order; we preserve it. + + List<(string paramName, string text)> embedRequests = new(); + foreach (ParameterMetadata configParam in configParams) { // Skip non-embed params — they pass through unchanged @@ -82,66 +103,33 @@ public static async Task SubstituteEmbedParametersAsync( continue; } - // Check if the request provided this parameter - // If not provided, DAB's existing required-param validation will handle it + // Skip embed params not supplied in the request. We don't enforce + // required-ness here because DAB's request validation for sprocs only + // checks for extra fields, not missing ones — see RequestValidator + // .ValidateStoredProcedureRequestContext, which delegates required-param + // detection to SQL Server (no easy way to read default-value metadata + // from sys.parameters in a portable way). + // + // When a required embed param is missing entirely, SqlExecuteStructure + // also won't bind it, and SQL Server returns "expects parameter X, which + // was not supplied." MsSqlDbExceptionParser translates that to a 400 + // DatabaseInputError for the client. So missing values still produce a + // clear, actionable error — just via the SQL-error-relay path rather + // than this helper. + // + // (Explicit null or empty string DOES reach this helper, via the + // IsNullOrWhiteSpace check below — that path produces our own 400 with + // the friendlier "has 'embed: true' but the provided text is empty or + // whitespace" message.) if (!resolvedParams.TryGetValue(configParam.Name, out object? value)) { continue; } - // Validate: embed parameters must be strings. - // Azure OpenAI's embedding API only accepts strings as input — passing a number, - // boolean, array, or object would either be silently stringified into garbage - // (e.g., "System.Object[]") or rejected by the API with a confusing error. - // - // DAB delivers parameter values as either System.String OR System.Text.Json.JsonElement - // (the JSON parser wraps body values in JsonElement). We accept both string and - // JsonElement-of-kind-String, and reject all other types with a clear 400. - // - // Example FAIL: { "query_vector": 12345 } → JsonElement(Number) → 400 "must be a string" - // Example FAIL: { "query_vector": true } → JsonElement(True) → 400 "must be a string" - // Example FAIL: { "query_vector": ["a","b"] } → JsonElement(Array) → 400 "must be a string" - // Example FAIL: { "query_vector": {"foo":"bar"} } → JsonElement(Object) → 400 "must be a string" - // Example PASS: { "query_vector": "headphones" } → JsonElement(String) or string → proceed - // - // Note: GraphQL is automatically protected since the embed param is exposed as - // GraphQL String type (via Stage 2 metadata override) — non-strings are rejected - // by the GraphQL parser before reaching this code. - string? text = null; - if (value is string s) - { - text = s; - } - else if (value is System.Text.Json.JsonElement jsonElement) - { - if (jsonElement.ValueKind == System.Text.Json.JsonValueKind.String) - { - text = jsonElement.GetString(); - } - else if (jsonElement.ValueKind == System.Text.Json.JsonValueKind.Null) - { - text = null; // Will be caught by IsNullOrWhiteSpace below - } - else - { - throw new DataApiBuilderException( - message: $"Parameter '{configParam.Name}' has 'embed: true' but received a non-string JSON value of kind '{jsonElement.ValueKind}'. Embed parameters must be JSON strings.", - statusCode: HttpStatusCode.BadRequest, - subStatusCode: DataApiBuilderException.SubStatusCodes.BadRequest); - } - } - else if (value is not null) - { - throw new DataApiBuilderException( - message: $"Parameter '{configParam.Name}' has 'embed: true' but received a non-string value of type '{value.GetType().Name}'. Embed parameters must be JSON strings.", - statusCode: HttpStatusCode.BadRequest, - subStatusCode: DataApiBuilderException.SubStatusCodes.BadRequest); - } + // Validate type and extract text (throws 400 for non-strings) + string? text = ExtractTextValue(configParam.Name, value); - // Validate: embed params must have non-empty text - // Example FAIL: { "query_vector": "" } → 400 error - // Example FAIL: { "query_vector": " " } → 400 error - // Example FAIL: { "query_vector": null } → 400 error (null treated as empty) + // Validate non-empty (throws 400 for empty/whitespace/null) if (string.IsNullOrWhiteSpace(text)) { throw new DataApiBuilderException( @@ -150,16 +138,61 @@ public static async Task SubstituteEmbedParametersAsync( subStatusCode: DataApiBuilderException.SubStatusCodes.BadRequest); } - // Call EmbeddingService to convert text → float[1536] - // The service has built-in FusionCache (L1 + optional L2 Redis): - // First call for "wireless headphones" → calls Azure OpenAI API (~200-500ms) - // Second call for same text → cache hit, returns instantly - EmbeddingResult result = await embeddingService.TryEmbedAsync(text, cancellationToken); + embedRequests.Add((configParam.Name, text!)); + } + + // No embed param values supplied in this request — nothing to embed + if (embedRequests.Count == 0) + { + return; + } + + // ───────────────────────────────────────────────────────────────────── + // PHASE 2: BATCH — single API call for all texts at once + // ───────────────────────────────────────────────────────────────────── + // EmbeddingService.TryEmbedBatchAsync (Phase 1 infra) does its own per-text + // FusionCache check. Texts that hit the cache don't trigger API calls; + // only uncached texts go to Azure OpenAI in a single batched request. + // + // For 1 text: equivalent to TryEmbedAsync (no overhead). + // For N texts: 1 API call instead of N sequential ones (saves ~(N-1) × latency). + + string[] texts = embedRequests.Select(r => r.text).ToArray(); + EmbeddingBatchResult batchResult = await embeddingService.TryEmbedBatchAsync(texts, cancellationToken); + + if (!batchResult.Success || batchResult.Embeddings is null) + { + // Batch failure: we lose per-param error specificity here, but the + // batch result's ErrorMessage typically explains the underlying issue. + // Naming all involved params helps the user identify the request context. + string paramNames = string.Join(", ", embedRequests.Select(r => $"'{r.paramName}'")); + throw new DataApiBuilderException( + message: $"Failed to generate embeddings for parameter(s) {paramNames}.", + statusCode: HttpStatusCode.InternalServerError, + subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); + } - if (!result.Success || result.Embedding is null) + // Defensive: TryEmbedBatchAsync should return one embedding per input text + if (batchResult.Embeddings.Length != embedRequests.Count) + { + throw new DataApiBuilderException( + message: $"Embedding service returned {batchResult.Embeddings.Length} embeddings but {embedRequests.Count} were requested.", + statusCode: HttpStatusCode.InternalServerError, + subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); + } + + // ───────────────────────────────────────────────────────────────────── + // PHASE 3: SUBSTITUTE — serialize each vector and write back into resolvedParams + // ───────────────────────────────────────────────────────────────────── + // Order is preserved: embedRequests[i] corresponds to batchResult.Embeddings[i]. + + for (int i = 0; i < embedRequests.Count; i++) + { + float[] embedding = batchResult.Embeddings[i]; + if (embedding is null || embedding.Length == 0) { throw new DataApiBuilderException( - message: $"Failed to generate embedding for parameter '{configParam.Name}'.", + message: $"Embedding service returned an empty vector for parameter '{embedRequests[i].paramName}'.", statusCode: HttpStatusCode.InternalServerError, subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); } @@ -177,13 +210,70 @@ public static async Task SubstituteEmbedParametersAsync( // - "G9" guarantees the string can be parsed back to the exact same float // - Embeddings are precision-sensitive — even tiny drift affects cosine similarity scores string vectorJson = "[" - + string.Join(",", result.Embedding.Select(f => f.ToString("G9", CultureInfo.InvariantCulture))) + + string.Join(",", embedding.Select(f => f.ToString("G9", CultureInfo.InvariantCulture))) + "]"; // Replace the text value with the vector string in-place // SqlExecuteStructure will see this as a String value (thanks to metadata override) // and pass it through to SQL, which auto-casts to VECTOR(N) - resolvedParams[configParam.Name] = vectorJson; + resolvedParams[embedRequests[i].paramName] = vectorJson; + } + } + + /// + /// Validates that the parameter value is a string (or JsonElement of kind String) and extracts it. + /// + /// Azure OpenAI's embedding API only accepts strings as input — passing a number, + /// boolean, array, or object would either be silently stringified into garbage + /// (e.g., "System.Object[]") or rejected by the API with a confusing error. + /// + /// DAB delivers parameter values as either System.String OR System.Text.Json.JsonElement + /// (the JSON parser wraps body values in JsonElement). We accept both string and + /// JsonElement-of-kind-String, and reject all other types with a clear 400. + /// + /// Example FAIL: { "query_vector": 12345 } → JsonElement(Number) → 400 "must be a string" + /// Example FAIL: { "query_vector": true } → JsonElement(True) → 400 "must be a string" + /// Example FAIL: { "query_vector": ["a","b"] } → JsonElement(Array) → 400 "must be a string" + /// Example FAIL: { "query_vector": {"foo":"bar"} } → JsonElement(Object) → 400 "must be a string" + /// Example PASS: { "query_vector": "headphones" } → JsonElement(String) or string → returns text + /// + /// Note: GraphQL is automatically protected since the embed param is exposed as + /// GraphQL String type (via Stage 2 metadata override) — non-strings are rejected + /// by the GraphQL parser before reaching this code. + /// + private static string? ExtractTextValue(string paramName, object? value) + { + if (value is string s) + { + return s; + } + + if (value is JsonElement jsonElement) + { + if (jsonElement.ValueKind == JsonValueKind.String) + { + return jsonElement.GetString(); + } + + if (jsonElement.ValueKind == JsonValueKind.Null) + { + return null; // Will be caught by IsNullOrWhiteSpace in caller + } + + throw new DataApiBuilderException( + message: $"Parameter '{paramName}' has 'embed: true' but received a non-string JSON value of kind '{jsonElement.ValueKind}'. Embed parameters must be JSON strings.", + statusCode: HttpStatusCode.BadRequest, + subStatusCode: DataApiBuilderException.SubStatusCodes.BadRequest); + } + + if (value is not null) + { + throw new DataApiBuilderException( + message: $"Parameter '{paramName}' has 'embed: true' but received a non-string value of type '{value.GetType().Name}'. Embed parameters must be JSON strings.", + statusCode: HttpStatusCode.BadRequest, + subStatusCode: DataApiBuilderException.SubStatusCodes.BadRequest); } + + return null; // null value will be caught by IsNullOrWhiteSpace in caller } } From df98f6d0e41527f5fa899e65b87dad0b5263089b Mon Sep 17 00:00:00 2001 From: Pratik Shrivastav Date: Mon, 4 May 2026 12:37:27 -0700 Subject: [PATCH 08/11] Phase 3 Stage 4.1: Refactors enabling Phase 3 unit-testability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Behavior-preserving refactors to make Phase 3's previously-untestable internal logic accessible to the test project. No production behavior changes. Three changes: 1. src/Core/Azure.DataApiBuilder.Core.csproj — add InternalsVisibleTo New ItemGroup: Lets the test project directly invoke 'internal' members of Core. This is a new pattern in DAB (no existing usages of InternalsVisibleTo). Adopted intentionally so we can test implementation-detail helpers without expanding the production public API surface. 2. src/Core/Configurations/RuntimeConfigValidator.cs — visibility change ValidateEmbedParameters: private → internal Now testable from Service.Tests via the InternalsVisibleTo bridge above. Added an XML block explaining why it's internal rather than private, and noting that external callers should still go through ValidateConfigProperties. 3. src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs — extract helper Pulled the inline embed type override + non-VECTOR rejection logic out of FillSchemaForStoredProcedureAsync into a new internal static method: internal static void ApplyEmbedTypeOverride( ParameterDefinition parameterDefinition, ParameterMetadata paramMetadata, string schemaName, string storedProcedureName, string parameterName) The body is byte-for-byte the same logic that was previously inline (early-returns when Embed is false; throws if not Byte[]; otherwise overrides SystemType/DbType/SqlDbType). All explanatory comments preserved on the helper as XML . The original call site is replaced with a single call to ApplyEmbedTypeOverride. Helper is internal static so tests can construct ParameterDefinition instances and invoke it directly without needing a real metadata provider or DB connection. Why this commit lands separately from the actual tests ------------------------------------------------------ These three changes are pure refactors with no test code yet. Stage 4.2-4.4 add the new tests that depend on these refactors. Splitting the refactors into their own commit keeps each commit focused and bisect-friendly: this commit can be reverted independently if a future refactor needs to reorganize the helper without disturbing test code. Verification ------------ - dotnet build src/Core/Azure.DataApiBuilder.Core.csproj -c Release → Build succeeded. 0 Warning(s). 0 Error(s). - dotnet build src/Service.Tests/Azure.DataApiBuilder.Service.Tests.csproj -c Release → Build succeeded. 0 Warning(s). 0 Error(s). (Confirms InternalsVisibleTo is honored — test project still builds cleanly with no test code yet referencing the internals.) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/Core/Azure.DataApiBuilder.Core.csproj | 4 + .../Configurations/RuntimeConfigValidator.cs | 7 +- .../MsSqlMetadataProvider.cs | 144 ++++++++++++------ 3 files changed, 105 insertions(+), 50 deletions(-) diff --git a/src/Core/Azure.DataApiBuilder.Core.csproj b/src/Core/Azure.DataApiBuilder.Core.csproj index 8b32a8682c..d4990a9cbc 100644 --- a/src/Core/Azure.DataApiBuilder.Core.csproj +++ b/src/Core/Azure.DataApiBuilder.Core.csproj @@ -58,6 +58,10 @@ + + + + diff --git a/src/Core/Configurations/RuntimeConfigValidator.cs b/src/Core/Configurations/RuntimeConfigValidator.cs index 97242a681f..8fcdb5fd47 100644 --- a/src/Core/Configurations/RuntimeConfigValidator.cs +++ b/src/Core/Configurations/RuntimeConfigValidator.cs @@ -428,7 +428,12 @@ public void ValidateEmbeddingsOptions(RuntimeConfig runtimeConfig) /// Validates that parameters with embed=true are only used on stored-procedure entities /// and that runtime.embeddings is configured when embed parameters are present. /// - private void ValidateEmbedParameters(RuntimeConfig runtimeConfig) + /// + /// Internal (rather than private) to allow direct unit testing via the + /// InternalsVisibleTo attribute on Azure.DataApiBuilder.Core. Callers outside + /// the assembly should still go through . + /// + internal void ValidateEmbedParameters(RuntimeConfig runtimeConfig) { // Check once whether the embedding service is configured and enabled. // Example: "runtime": { "embeddings": { "enabled": true, "provider": "azure-openai" } } → true diff --git a/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs b/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs index 7e4558e7ae..583c59430a 100644 --- a/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs +++ b/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs @@ -225,55 +225,15 @@ protected override async Task FillSchemaForStoredProcedureAsync( parameterDefinition.ConfigDefaultValue = paramMetadata.Default?.ToString(); // Phase 3: validate and override types for embed:true parameters. - // - // SQL Server reports VECTOR(N) as "varbinary" via INFORMATION_SCHEMA.PARAMETERS, - // which DAB maps to typeof(byte[]). We use this Byte[] mapping as our signal - // that the parameter is VECTOR-shaped. If the param is NOT Byte[], the - // developer has misconfigured — embed only makes sense for VECTOR parameters. - // - // Why fail at startup instead of letting it silently break at request time: - // - embed:true on NVARCHAR → vector JSON treated as text → empty/wrong results, no error - // - embed:true on INT/DATETIME → SQL conversion error at request time - // - embed:true on VECTOR(N) → works correctly (SQL reports as Byte[]) - // Catching this at startup gives a clear error message before any user is impacted. - // - // KNOWN LIMITATION: SQL Server's INFORMATION_SCHEMA.PARAMETERS reports both - // VECTOR(N) and varbinary(N) as the same "varbinary" type. We cannot - // distinguish them here. If a developer mistakenly applies embed:true to - // a real varbinary parameter (e.g., image blob), this override will fire - // and the request will fail at SQL execution with an "implicit conversion - // not allowed" error. That failure is loud and clear, so we accept it. - // Documented in ParameterMetadata.Embed and the JSON schema. - // - // (Detection IS possible via sys.parameters which distinguishes vector - // from varbinary by user_type_id, but adding a separate query for this - // would require infrastructure changes beyond Phase 3 scope. Filed as - // future enhancement.) - // - // Follows the same pattern as the DateTime DbType override above. - if (paramMetadata.Embed) - { - // Reject embed:true on non-VECTOR-shaped params at startup with a - // clear error. This catches NVARCHAR/INT/DATETIME misconfigurations - // that would otherwise silently produce wrong results at request time. - if (parameterDefinition.SystemType != typeof(byte[])) - { - throw new DataApiBuilderException( - message: $"Procedure '{schemaName}.{storedProcedureSourceName}': " + - $"parameter '{configParamKey}' has 'embed: true' in config but is " + - $"declared as '{parameterDefinition.SystemType.Name}' in the stored procedure. " + - $"Embed parameters must be declared as VECTOR(N).", - statusCode: HttpStatusCode.ServiceUnavailable, - subStatusCode: DataApiBuilderException.SubStatusCodes.ErrorInInitialization); - } - - // Override the metadata type so the vector JSON string flows through - // DAB's String type pipeline. SQL Server auto-casts NVARCHAR → VECTOR - // at execution time. - parameterDefinition.SystemType = typeof(string); - parameterDefinition.DbType = System.Data.DbType.String; - parameterDefinition.SqlDbType = System.Data.SqlDbType.NVarChar; - } + // Logic extracted into a testable helper. See ApplyEmbedTypeOverride + // for the full rationale (Byte[] is our VECTOR(N) signal, override + // applies the String type pipeline, etc.). + ApplyEmbedTypeOverride( + parameterDefinition, + paramMetadata, + schemaName: schemaName, + storedProcedureName: storedProcedureSourceName, + parameterName: configParamKey); } } } @@ -282,6 +242,92 @@ protected override async Task FillSchemaForStoredProcedureAsync( GraphQLStoredProcedureExposedNameToEntityNameMap.TryAdd(GenerateStoredProcedureGraphQLFieldName(entityName, procedureEntity), entityName); } + /// + /// Phase 3: For a stored procedure parameter declared with embed: true in config, + /// validates that the parameter is VECTOR-shaped (reported by SQL Server as Byte[]) + /// and overrides its metadata type so the embedding-substituted JSON string flows through + /// DAB's String type pipeline at request time. SQL Server auto-casts NVARCHAR → VECTOR + /// at execution time. + /// + /// + /// The parameter definition to mutate. On entry, its SystemType reflects the SQL + /// Server-reported type (typeof(byte[]) for VECTOR(N) params). On exit, if + /// .Embed is true and the type was Byte[], the SystemType, + /// DbType, and SqlDbType are overridden to String / String / NVarChar respectively. + /// + /// The config-side metadata; embed flag determines behavior. + /// Schema name (e.g. "dbo"), for error messages. + /// Stored-procedure name, for error messages. + /// Parameter name (without leading @), for error messages. + /// + /// Thrown at startup with HTTP 503 if embed: true is set but the parameter is not + /// VECTOR-shaped (i.e., not Byte[]). Catches NVARCHAR/INT/DATETIME misconfigurations + /// before they cause silent wrong results or confusing errors at request time. + /// + /// + /// SQL Server reports VECTOR(N) as "varbinary" via INFORMATION_SCHEMA.PARAMETERS, + /// which DAB maps to typeof(byte[]). We use this Byte[] mapping as our signal + /// that the parameter is VECTOR-shaped. If the param is NOT Byte[], the + /// developer has misconfigured — embed only makes sense for VECTOR parameters. + /// + /// Why fail at startup instead of letting it silently break at request time: + /// - embed:true on NVARCHAR → vector JSON treated as text → empty/wrong results, no error + /// - embed:true on INT/DATETIME → SQL conversion error at request time + /// - embed:true on VECTOR(N) → works correctly (SQL reports as Byte[]) + /// Catching this at startup gives a clear error message before any user is impacted. + /// + /// KNOWN LIMITATION: SQL Server's INFORMATION_SCHEMA.PARAMETERS reports both + /// VECTOR(N) and varbinary(N) as the same "varbinary" type. We cannot + /// distinguish them here. If a developer mistakenly applies embed:true to + /// a real varbinary parameter (e.g., image blob), this override will fire + /// and the request will fail at SQL execution with an "implicit conversion + /// not allowed" error. That failure is loud and clear, so we accept it. + /// Documented in ParameterMetadata.Embed and the JSON schema. + /// + /// (Detection IS possible via sys.parameters which distinguishes vector + /// from varbinary by user_type_id, but adding a separate query for this + /// would require infrastructure changes beyond Phase 3 scope. Filed as + /// future enhancement.) + /// + /// Follows the same pattern as the DateTime DbType override above. + /// + /// Internal static (rather than private inline) to enable direct unit testing + /// via the InternalsVisibleTo attribute on Azure.DataApiBuilder.Core. + /// + internal static void ApplyEmbedTypeOverride( + ParameterDefinition parameterDefinition, + ParameterMetadata paramMetadata, + string schemaName, + string storedProcedureName, + string parameterName) + { + if (!paramMetadata.Embed) + { + return; + } + + // Reject embed:true on non-VECTOR-shaped params at startup with a + // clear error. This catches NVARCHAR/INT/DATETIME misconfigurations + // that would otherwise silently produce wrong results at request time. + if (parameterDefinition.SystemType != typeof(byte[])) + { + throw new DataApiBuilderException( + message: $"Procedure '{schemaName}.{storedProcedureName}': " + + $"parameter '{parameterName}' has 'embed: true' in config but is " + + $"declared as '{parameterDefinition.SystemType.Name}' in the stored procedure. " + + $"Embed parameters must be declared as VECTOR(N).", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.ErrorInInitialization); + } + + // Override the metadata type so the vector JSON string flows through + // DAB's String type pipeline. SQL Server auto-casts NVARCHAR → VECTOR + // at execution time. + parameterDefinition.SystemType = typeof(string); + parameterDefinition.DbType = System.Data.DbType.String; + parameterDefinition.SqlDbType = System.Data.SqlDbType.NVarChar; + } + /// protected override void PopulateMetadataForLinkingObject( string entityName, From cec1773ee57ef5a376e8c24078db38c453ad17ab Mon Sep 17 00:00:00 2001 From: Pratik Shrivastav Date: Mon, 4 May 2026 14:57:32 -0700 Subject: [PATCH 09/11] Phase 3 Stage 4.2: Add ParameterEmbeddingHelper unit tests (28 tests) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the first formal automated test coverage for Phase 3 production code. New file: src/Service.Tests/UnitTests/ParameterEmbeddingHelperTests.cs 28 tests covering ParameterEmbeddingHelper.SubstituteEmbedParametersAsync, organized into 7 #region blocks (matching the EmbeddingServiceTests.cs "#region Batch Embedding Tests" pattern): #region No-Op Cases (4 tests) - NullConfigParams_ReturnsImmediately_NoServiceCall - NoEmbedParams_ReturnsImmediately_NoServiceCall - EmbedParamsConfiguredButNoneSupplied_ReturnsAfterCollect_NoServiceCall - EmptyConfigParamsList_ReturnsImmediately_NoServiceCall #region Service Availability (2 tests) - NullService_WithEmbedParams_Throws503 (defense-in-depth) - NullService_WithoutEmbedParams_NoThrow (backward compat) #region Input Type Validation (8 tests) - PlainString_AcceptsAndEmbeds - JsonElementString_AcceptsAndEmbeds - JsonElementNumber_Throws400 - JsonElementBoolean_Throws400 - JsonElementArray_Throws400 - JsonElementObject_Throws400 - JsonElementNull_Throws400AsEmpty - NonStringNonJsonElementValue_Throws400 #region Empty And Whitespace Validation (3 tests) - EmptyString_Throws400 - WhitespaceString_Throws400 - NullValue_Throws400 #region Batching Behavior (5 tests) — covers Jim review comment #2 - SingleEmbedParam_CallsBatchOnce_NotSequential - MultipleEmbedParams_CallsBatchOnce_NotSequential ← key batching guarantee - MixedEmbedAndNonEmbed_OnlyEmbedTextsBatched - MultipleEmbedParams_OrderPreserved_BatchTextsMatchConfigOrder - PartiallySuppliedEmbedParams_BatchesSubsetOnly #region Batch Result Handling (4 tests) - BatchSuccess_VectorsSubstitutedInResolvedParams - BatchFailure_Throws500_WithAllParamNames - BatchLengthMismatch_Throws500 - IndividualEmbeddingEmpty_Throws500_NamingFailedParam #region Output Format And Cancellation (2 tests) - VectorJson_UsesG9AndInvariantCulture ← validates locale-independent G9 float serialization - CancellationToken_ForwardedToEmbeddingService Implementation notes -------------------- - Mocking pattern: Mock (Strict). Each test sets up the expected batch call and verifies post-conditions on resolvedParams. No database, no DI container — pure unit tests. - Helper factories at top of class (EmbedParam, NormalParam, JsonElementFrom, SetupBatch, VerifyBatchedExactlyOnce) keep individual test bodies focused on the behavior being tested. - JsonElement construction uses JsonDocument.Parse(...).RootElement.Clone() so the parsed element survives after the source document is disposed. - Float values used in expected-output assertions are powers of 1/2 (0.5, 0.25, 0.125) which are exactly representable in binary float and round-trip through G9 to the same string representation. The "G9 + InvariantCulture" test specifically uses non-exact values (0.1f, -0.2f, 0.0001234567f) and asserts on parsed-back values rather than exact strings to verify precision and locale independence. - `#nullable enable` directive at the top of the file is required because the helper signature uses IDictionary and the test project is set to `disable` globally. Per-file enable is the smallest scoped change that lets us match the helper's signature without introducing project-wide nullable warnings. Test results ------------ - dotnet build src/Service.Tests/Azure.DataApiBuilder.Service.Tests.csproj -c Release → Build succeeded. 0 Warning(s). 0 Error(s). - dotnet test --filter "FullyQualifiedName~ParameterEmbeddingHelperTests" → Total tests: 28. Passed: 28. Failed: 0. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../ParameterEmbeddingHelperTests.cs | 749 ++++++++++++++++++ 1 file changed, 749 insertions(+) create mode 100644 src/Service.Tests/UnitTests/ParameterEmbeddingHelperTests.cs diff --git a/src/Service.Tests/UnitTests/ParameterEmbeddingHelperTests.cs b/src/Service.Tests/UnitTests/ParameterEmbeddingHelperTests.cs new file mode 100644 index 0000000000..8f58f9e2d5 --- /dev/null +++ b/src/Service.Tests/UnitTests/ParameterEmbeddingHelperTests.cs @@ -0,0 +1,749 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#nullable enable + +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Net; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using Azure.DataApiBuilder.Config.ObjectModel; +using Azure.DataApiBuilder.Core.Services.Embeddings; +using Azure.DataApiBuilder.Service.Exceptions; +using Microsoft.VisualStudio.TestTools.UnitTesting; +using Moq; + +namespace Azure.DataApiBuilder.Service.Tests.UnitTests; + +/// +/// Unit tests for . +/// +/// Covers all behaviors of the helper that converts text-valued sproc parameters +/// (declared with embed: true in config) into vector JSON strings via a +/// mocked . +/// +/// Test groups (organized into #region blocks below): +/// 1. No-Op Cases — early-exit paths that never touch the service +/// 2. Service Availability — null IEmbeddingService handling +/// 3. Input Type Validation — accepted vs rejected value types +/// 4. Empty And Whitespace — empty / whitespace / null text rejection +/// 5. Batching Behavior — proves single batch call (not N sequential) +/// 6. Batch Result Handling — success, failure, length mismatch, null vector +/// 7. Output Format And Cancellation — G9 + InvariantCulture; cancellation token forwarding +/// +[TestClass] +public class ParameterEmbeddingHelperTests +{ + private Mock _mockService = null!; + + [TestInitialize] + public void Setup() + { + _mockService = new Mock(MockBehavior.Strict); + } + + // ───────────────────────────────────────────────────────────────────── + // Helper factories — keep test bodies focused on what's being tested + // ───────────────────────────────────────────────────────────────────── + + private static ParameterMetadata EmbedParam(string name) => + new() { Name = name, Embed = true }; + + private static ParameterMetadata NormalParam(string name) => + new() { Name = name, Embed = false }; + + /// + /// Parses a JSON literal into a standalone that survives + /// after the source is disposed (via Clone). + /// + private static JsonElement JsonElementFrom(string json) + { + using JsonDocument doc = JsonDocument.Parse(json); + return doc.RootElement.Clone(); + } + + /// + /// Configures the mock to return a successful batch result with the given embeddings. + /// + private void SetupBatch(params float[][] embeddings) + { + _mockService + .Setup(s => s.TryEmbedBatchAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(new EmbeddingBatchResult(true, embeddings, null)); + } + + /// + /// Verifies the batch method was called exactly once AND that single-text TryEmbedAsync + /// was never called (proves we use the batched path, not sequential per-param calls). + /// + private void VerifyBatchedExactlyOnce() + { + _mockService.Verify( + s => s.TryEmbedBatchAsync(It.IsAny(), It.IsAny()), + Times.Once); + _mockService.Verify( + s => s.TryEmbedAsync(It.IsAny(), It.IsAny()), + Times.Never); + } + + #region No-Op Cases + + /// + /// When configParams is null (entity has no <parameters> section in config), + /// the helper should return immediately without calling the embedding service. + /// + [TestMethod] + public async Task NullConfigParams_ReturnsImmediately_NoServiceCall() + { + Dictionary resolvedParams = new() { { "x", "anything" } }; + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams: null, _mockService.Object, CancellationToken.None); + + _mockService.VerifyNoOtherCalls(); + Assert.AreEqual("anything", resolvedParams["x"]); + } + + /// + /// When configParams contains parameters but none have embed:true, the helper + /// should return without calling the embedding service. + /// + [TestMethod] + public async Task NoEmbedParams_ReturnsImmediately_NoServiceCall() + { + Dictionary resolvedParams = new() { { "x", "value" }, { "y", 42 } }; + List configParams = new() { NormalParam("x"), NormalParam("y") }; + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.VerifyNoOtherCalls(); + Assert.AreEqual("value", resolvedParams["x"]); + Assert.AreEqual(42, resolvedParams["y"]); + } + + /// + /// When embed params are configured but none are supplied in the request, the + /// helper should reach the early-return after Phase 1 collect (embedRequests.Count == 0) + /// without calling the service. + /// + [TestMethod] + public async Task EmbedParamsConfiguredButNoneSupplied_ReturnsAfterCollect_NoServiceCall() + { + Dictionary resolvedParams = new() { { "other", 5 } }; + List configParams = new() { EmbedParam("missing"), NormalParam("other") }; + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.VerifyNoOtherCalls(); + Assert.AreEqual(5, resolvedParams["other"]); + } + + /// + /// When configParams is an empty list, the helper should return immediately + /// (the "any embed param?" check exits before any iteration). + /// + [TestMethod] + public async Task EmptyConfigParamsList_ReturnsImmediately_NoServiceCall() + { + Dictionary resolvedParams = new() { { "x", "value" } }; + List configParams = new(); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.VerifyNoOtherCalls(); + } + + #endregion + + #region Service Availability + + /// + /// When the embedding service is null AND embed params are configured, the helper + /// should throw a 503 immediately (defense-in-depth check beyond config validation). + /// + [TestMethod] + public async Task NullService_WithEmbedParams_Throws503() + { + Dictionary resolvedParams = new() { { "q", "hello" } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, embeddingService: null, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + StringAssert.Contains(ex.Message, "embed parameter"); + StringAssert.Contains(ex.Message, "embedding service"); + } + + /// + /// When the embedding service is null but no embed params are configured, the helper + /// should NOT throw — the no-op early exit fires before the null-service check. + /// (Backward compat: existing entities without embed params work without an embedding service.) + /// + [TestMethod] + public async Task NullService_WithoutEmbedParams_NoThrow() + { + Dictionary resolvedParams = new() { { "x", "value" } }; + List configParams = new() { NormalParam("x") }; + + // Should not throw + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, embeddingService: null, CancellationToken.None); + + Assert.AreEqual("value", resolvedParams["x"]); + } + + #endregion + + #region Input Type Validation + + /// + /// A plain System.String value is accepted directly and embedded. + /// + [TestMethod] + public async Task PlainString_AcceptsAndEmbeds() + { + Dictionary resolvedParams = new() { { "q", "wireless headphones" } }; + List configParams = new() { EmbedParam("q") }; + SetupBatch(new[] { 0.5f, 0.25f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.Is(arr => arr.Length == 1 && arr[0] == "wireless headphones"), + It.IsAny()), + Times.Once); + Assert.AreEqual("[0.5,0.25]", resolvedParams["q"]); + } + + /// + /// A JsonElement of kind String (the JSON parser's representation of body strings) + /// is accepted and embedded. + /// + [TestMethod] + public async Task JsonElementString_AcceptsAndEmbeds() + { + Dictionary resolvedParams = new() { { "q", JsonElementFrom("\"hello\"") } }; + List configParams = new() { EmbedParam("q") }; + SetupBatch(new[] { 0.5f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.Is(arr => arr.Length == 1 && arr[0] == "hello"), + It.IsAny()), + Times.Once); + Assert.AreEqual("[0.5]", resolvedParams["q"]); + } + + /// + /// A JsonElement of kind Number (e.g., the user sent a number where text was expected) + /// must be rejected with a 400 BadRequest. + /// + [TestMethod] + public async Task JsonElementNumber_Throws400() + { + Dictionary resolvedParams = new() { { "q", JsonElementFrom("12345") } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "Number"); + StringAssert.Contains(ex.Message, "embed: true"); + } + + /// + /// A JsonElement of kind True/False (boolean) must be rejected with a 400 BadRequest. + /// + [TestMethod] + public async Task JsonElementBoolean_Throws400() + { + Dictionary resolvedParams = new() { { "q", JsonElementFrom("true") } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "True"); + } + + /// + /// A JsonElement of kind Array must be rejected with a 400 BadRequest. + /// + [TestMethod] + public async Task JsonElementArray_Throws400() + { + Dictionary resolvedParams = new() { { "q", JsonElementFrom("[\"a\",\"b\"]") } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "Array"); + } + + /// + /// A JsonElement of kind Object must be rejected with a 400 BadRequest. + /// + [TestMethod] + public async Task JsonElementObject_Throws400() + { + Dictionary resolvedParams = new() { { "q", JsonElementFrom("{\"foo\":\"bar\"}") } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "Object"); + } + + /// + /// A JsonElement of kind Null falls through to the IsNullOrWhiteSpace check and + /// is rejected as empty/whitespace (400). The helper does not throw a different + /// "non-string kind" message for Null since null is semantically "no value supplied". + /// + [TestMethod] + public async Task JsonElementNull_Throws400AsEmpty() + { + Dictionary resolvedParams = new() { { "q", JsonElementFrom("null") } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "empty or whitespace"); + } + + /// + /// A non-string, non-JsonElement value (e.g., a boxed int) must be rejected + /// with a 400 BadRequest naming the offending CLR type. + /// + [TestMethod] + public async Task NonStringNonJsonElementValue_Throws400() + { + Dictionary resolvedParams = new() { { "q", (object)42 } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "Int32"); + } + + #endregion + + #region Empty And Whitespace Validation + + /// + /// An explicit empty string value for an embed parameter is rejected with a 400. + /// + [TestMethod] + public async Task EmptyString_Throws400() + { + Dictionary resolvedParams = new() { { "q", "" } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "empty or whitespace"); + } + + /// + /// A whitespace-only string value for an embed parameter is rejected with a 400. + /// (IsNullOrWhiteSpace covers strings of only spaces, tabs, etc.) + /// + [TestMethod] + public async Task WhitespaceString_Throws400() + { + Dictionary resolvedParams = new() { { "q", " " } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "empty or whitespace"); + } + + /// + /// A C# null value (not JsonElement Null) for an embed parameter is rejected with a 400. + /// ExtractTextValue returns null for a null input; the IsNullOrWhiteSpace check then fires. + /// + [TestMethod] + public async Task NullValue_Throws400() + { + Dictionary resolvedParams = new() { { "q", null } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "empty or whitespace"); + } + + #endregion + + #region Batching Behavior + + /// + /// A single embed param results in a single batched call (with one text), NOT a + /// per-param TryEmbedAsync. Even single-param case goes through TryEmbedBatchAsync. + /// + [TestMethod] + public async Task SingleEmbedParam_CallsBatchOnce_NotSequential() + { + Dictionary resolvedParams = new() { { "q", "hello" } }; + List configParams = new() { EmbedParam("q") }; + SetupBatch(new[] { 0.5f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + VerifyBatchedExactlyOnce(); + // Verify the user's text was replaced with the serialized vector JSON + Assert.AreEqual("[0.5]", resolvedParams["q"]); + } + + /// + /// Multiple embed params (all supplied) result in a SINGLE batched call carrying + /// all texts at once — not N sequential per-param calls. This is the core batching + /// guarantee that addresses Jim's review comment #2. + /// + [TestMethod] + public async Task MultipleEmbedParams_CallsBatchOnce_NotSequential() + { + Dictionary resolvedParams = new() + { + { "p1", "alpha" }, + { "p2", "beta" }, + { "p3", "gamma" } + }; + List configParams = new() { EmbedParam("p1"), EmbedParam("p2"), EmbedParam("p3") }; + SetupBatch( + new[] { 0.5f }, new[] { 0.25f }, new[] { 0.125f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + // Exactly one batched call carrying all 3 texts (not 3 sequential calls) + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.Is(arr => arr.Length == 3), + It.IsAny()), + Times.Once); + _mockService.Verify( + s => s.TryEmbedAsync(It.IsAny(), It.IsAny()), + Times.Never); + + // Verify each user text was replaced with its corresponding serialized vector + Assert.AreEqual("[0.5]", resolvedParams["p1"]); + Assert.AreEqual("[0.25]", resolvedParams["p2"]); + Assert.AreEqual("[0.125]", resolvedParams["p3"]); + } + + /// + /// When configParams contains a mix of embed:true and embed:false params, only the + /// embed:true ones are batched. Non-embed params pass through untouched. + /// + [TestMethod] + public async Task MixedEmbedAndNonEmbed_OnlyEmbedTextsBatched() + { + Dictionary resolvedParams = new() + { + { "embedA", "search text 1" }, + { "normal1", 5 }, + { "embedB", "search text 2" }, + { "normal2", "literal" } + }; + List configParams = new() + { + EmbedParam("embedA"), + NormalParam("normal1"), + EmbedParam("embedB"), + NormalParam("normal2") + }; + SetupBatch(new[] { 0.5f }, new[] { 0.25f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.Is(arr => arr.Length == 2 && arr[0] == "search text 1" && arr[1] == "search text 2"), + It.IsAny()), + Times.Once); + + // Non-embed params pass through untouched + Assert.AreEqual(5, resolvedParams["normal1"]); + Assert.AreEqual("literal", resolvedParams["normal2"]); + // Embed params replaced with vector JSON + Assert.AreEqual("[0.5]", resolvedParams["embedA"]); + Assert.AreEqual("[0.25]", resolvedParams["embedB"]); + } + + /// + /// When configParams has multiple embed entries [A, B, C], the texts in the batch + /// call must be in that same order — the Substitute phase relies on index alignment + /// between the request list and the response list. + /// + [TestMethod] + public async Task MultipleEmbedParams_OrderPreserved_BatchTextsMatchConfigOrder() + { + Dictionary resolvedParams = new() + { + { "C", "third" }, + { "A", "first" }, + { "B", "second" } + }; + // configParams declared in order A, B, C — this is the order that should be preserved. + List configParams = new() { EmbedParam("A"), EmbedParam("B"), EmbedParam("C") }; + SetupBatch(new[] { 0.5f }, new[] { 0.25f }, new[] { 0.125f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.Is(arr => + arr.Length == 3 && arr[0] == "first" && arr[1] == "second" && arr[2] == "third"), + It.IsAny()), + Times.Once); + + // And substitution preserves order: A → 0.5, B → 0.25, C → 0.125 + Assert.AreEqual("[0.5]", resolvedParams["A"]); + Assert.AreEqual("[0.25]", resolvedParams["B"]); + Assert.AreEqual("[0.125]", resolvedParams["C"]); + } + + /// + /// When 3 embed params are configured but only 2 are supplied in the request, + /// the batch should contain only the supplied subset. The third param is left + /// for downstream handling (SQL-level required-param error). + /// + [TestMethod] + public async Task PartiallySuppliedEmbedParams_BatchesSubsetOnly() + { + Dictionary resolvedParams = new() + { + { "p1", "supplied 1" }, + { "p3", "supplied 3" } + // p2 is missing + }; + List configParams = new() { EmbedParam("p1"), EmbedParam("p2"), EmbedParam("p3") }; + SetupBatch(new[] { 0.5f }, new[] { 0.125f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.Is(arr => arr.Length == 2 && arr[0] == "supplied 1" && arr[1] == "supplied 3"), + It.IsAny()), + Times.Once); + + Assert.AreEqual("[0.5]", resolvedParams["p1"]); + Assert.AreEqual("[0.125]", resolvedParams["p3"]); + Assert.IsFalse(resolvedParams.ContainsKey("p2")); + } + + #endregion + + #region Batch Result Handling + + /// + /// A successful batch result should produce vector JSON substitutions in resolvedParams + /// — happy-path coverage for the substitute phase. + /// + [TestMethod] + public async Task BatchSuccess_VectorsSubstitutedInResolvedParams() + { + Dictionary resolvedParams = new() { { "p1", "x" }, { "p2", "y" } }; + List configParams = new() { EmbedParam("p1"), EmbedParam("p2") }; + SetupBatch(new[] { 1.0f, 2.0f }, new[] { 3.0f, 4.0f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + Assert.AreEqual("[1,2]", resolvedParams["p1"]); + Assert.AreEqual("[3,4]", resolvedParams["p2"]); + } + + /// + /// A failed batch result (Success: false) should throw a 500 InternalServerError + /// listing all involved param names so the caller can identify request context. + /// + [TestMethod] + public async Task BatchFailure_Throws500_WithAllParamNames() + { + Dictionary resolvedParams = new() { { "p1", "a" }, { "p2", "b" } }; + List configParams = new() { EmbedParam("p1"), EmbedParam("p2") }; + _mockService + .Setup(s => s.TryEmbedBatchAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(new EmbeddingBatchResult(false, null, "OpenAI rate limit")); + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.InternalServerError, ex.StatusCode); + StringAssert.Contains(ex.Message, "p1"); + StringAssert.Contains(ex.Message, "p2"); + } + + /// + /// A successful batch result with the wrong embedding count (service contract violation) + /// should throw a 500 InternalServerError mentioning the count mismatch, instead of + /// silently using the wrong vectors. + /// + [TestMethod] + public async Task BatchLengthMismatch_Throws500() + { + Dictionary resolvedParams = new() { { "p1", "a" }, { "p2", "b" } }; + List configParams = new() { EmbedParam("p1"), EmbedParam("p2") }; + // Service returns ONE embedding when TWO were requested + SetupBatch(new[] { 0.1f }); + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.InternalServerError, ex.StatusCode); + StringAssert.Contains(ex.Message, "1"); + StringAssert.Contains(ex.Message, "2"); + } + + /// + /// A batch with one valid and one empty embedding (e.g., empty array slot) should + /// throw a 500 naming the specific param whose embedding was empty. + /// + [TestMethod] + public async Task IndividualEmbeddingEmpty_Throws500_NamingFailedParam() + { + Dictionary resolvedParams = new() { { "p1", "a" }, { "p2", "b" } }; + List configParams = new() { EmbedParam("p1"), EmbedParam("p2") }; + // Second embedding is empty — defensive check should fire + SetupBatch(new[] { 0.1f }, System.Array.Empty()); + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.InternalServerError, ex.StatusCode); + StringAssert.Contains(ex.Message, "p2"); + } + + #endregion + + #region Output Format And Cancellation + + /// + /// The vector JSON output must use G9 float formatting and InvariantCulture to ensure + /// round-trippability and locale-independent decimal separators. This is critical for + /// embedding precision (G7 default would lose ~30% of values per Microsoft docs). + /// + /// Tests: + /// - No spaces between values + /// - Period (.) as decimal separator regardless of thread culture + /// - G9 precision preserved (a value that requires G9 to round-trip) + /// + [TestMethod] + public async Task VectorJson_UsesG9AndInvariantCulture() + { + Dictionary resolvedParams = new() { { "q", "any" } }; + List configParams = new() { EmbedParam("q") }; + + // 0.1f cannot be exactly represented in binary float; G9 preserves enough digits + // to round-trip. G7 (the "G" default) would give "0.1" which loses precision. + SetupBatch(new[] { 0.1f, -0.2f, 0.0001234567f }); + + // Save and switch culture to ensure InvariantCulture is enforced + CultureInfo originalCulture = CultureInfo.CurrentCulture; + try + { + // German locale uses comma as decimal separator — proves we're using InvariantCulture + CultureInfo.CurrentCulture = new CultureInfo("de-DE"); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + } + finally + { + CultureInfo.CurrentCulture = originalCulture; + } + + string vectorJson = (string)resolvedParams["q"]!; + + // No spaces + Assert.IsFalse(vectorJson.Contains(" "), "Vector JSON should not contain spaces"); + // Period decimal separator (not German comma) + Assert.IsTrue(vectorJson.Contains("0.1"), "Vector JSON should use period as decimal separator"); + // Bracketed + Assert.IsTrue(vectorJson.StartsWith("["), "Vector JSON should start with ["); + Assert.IsTrue(vectorJson.EndsWith("]"), "Vector JSON should end with ]"); + // Comma-separated values + Assert.AreEqual(3, vectorJson.Split(',').Length, "Vector JSON should have 3 comma-separated values"); + + // Round-trip the parsed values to confirm G9 precision + string trimmed = vectorJson.Substring(1, vectorJson.Length - 2); + string[] parts = trimmed.Split(','); + Assert.AreEqual(0.1f, float.Parse(parts[0], CultureInfo.InvariantCulture)); + Assert.AreEqual(-0.2f, float.Parse(parts[1], CultureInfo.InvariantCulture)); + Assert.AreEqual(0.0001234567f, float.Parse(parts[2], CultureInfo.InvariantCulture)); + } + + /// + /// The CancellationToken passed to the helper must be forwarded to the embedding + /// service's batch call so request cancellation propagates correctly. + /// + [TestMethod] + public async Task CancellationToken_ForwardedToEmbeddingService() + { + Dictionary resolvedParams = new() { { "q", "hello" } }; + List configParams = new() { EmbedParam("q") }; + SetupBatch(new[] { 0.1f }); + + using CancellationTokenSource cts = new(); + CancellationToken token = cts.Token; + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, token); + + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.IsAny(), + It.Is(ct => ct == token)), + Times.Once); + } + + #endregion +} From 38b9edcf728312fd28f640c98d4a14cc2b6b6941 Mon Sep 17 00:00:00 2001 From: Pratik Shrivastav Date: Mon, 4 May 2026 15:57:11 -0700 Subject: [PATCH 10/11] Phase 3 Stage 4.3: Add validator + metadata override unit tests (16 tests) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds focused unit tests for two pieces of Phase 3 production code that were untested before Stage 4.1's refactors made them testable. Files modified -------------- 1. src/Service.Tests/UnitTests/ConfigValidationUnitTests.cs (+10 tests) New `#region Embed Parameters Validation` block targeting `RuntimeConfigValidator.ValidateEmbedParameters` (now `internal` after Stage 4.1; accessible via `[InternalsVisibleTo]`). Test coverage by validation rule: Rule 0 — embed:true requires MSSQL data source (2 [DataRow]s) - Rule 0: embed:true on PostgreSQL → 503 - Rule 0: embed:true on MySQL → 503 Rule 1 — embed:true requires stored-procedure entity (2 [DataRow]s + 1) - ValidateEmbedParameters_EmbedOnStoredProcedure_NoError (happy path) - Rule 1: embed:true on Table → 503 - Rule 1: embed:true on View → 503 Rule 2 — embed:true requires embeddings configured (2 [DataRow]s) - Rule 2: embeddings.enabled=false → 503 - Rule 2: embeddings section missing → 503 Rule 3 — embed:true cannot have a default value (2 tests) - ValidateEmbedParameters_EmbedTrue_WithDefault_ThrowsConfigError - ValidateEmbedParameters_EmbedTrue_WithoutDefault_NoError Multi-entity message content (1 test) - ValidateEmbedParameters_MultipleEntities_OneViolates_NamesViolatingEntityAndParam (asserts the error names ONLY the offending entity/param, not the healthy ones) Helper methods (private static, scoped to the test class): - BuildEmbeddingsEnabled() — valid EmbeddingsOptions for happy path - BuildSprocEntity(...) — stored-procedure entity with one parameter - BuildEntityWithSourceType(...) — table/view/sproc entity for Rule 1 tests - BuildRuntimeConfigForEmbedTest(...) — assembles the runtime config Uses [DataTestMethod] + [DataRow] where test shapes repeat (matches the existing ValidateEmbeddingsOptions_BaseUrl pattern in this file). 2. src/Service.Tests/UnitTests/SqlMetadataProviderUnitTests.cs (+6 tests) New `#region Embed Type Override` block targeting `MsSqlMetadataProvider.ApplyEmbedTypeOverride` (the static helper extracted in Stage 4.1; `internal` accessible via `[InternalsVisibleTo]`). Test coverage: Type override behavior (4 tests) - ApplyEmbedTypeOverride_ByteArrayParam_WithEmbedTrue_OverridesToString - ApplyEmbedTypeOverride_StringParam_WithEmbedTrue_ThrowsAtStartup - ApplyEmbedTypeOverride_IntParam_WithEmbedTrue_ThrowsAtStartup - ApplyEmbedTypeOverride_ByteArrayParam_WithEmbedFalse_NoChange Edge cases (2 tests) - ApplyEmbedTypeOverride_ByteArrayWithRequiredAndDefault_OnlyTypeMetadataChanges (proves helper only mutates type fields, not Required/Default/etc.) - ApplyEmbedTypeOverride_MultipleParams_OnlyEmbedTrueOnesOverridden (multi-call scenario; proves no cross-call state) Implementation notes -------------------- - All 16 tests construct ParameterDefinition / ParameterMetadata / Entity / RuntimeConfig instances directly. No DB connection, no DI container, no full metadata-provider construction. Pure unit tests. - Validator-test helper deliberately leaves embeddings as null when not passed (NOT defaulted to enabled). Tests targeting Rule 2's "section missing" path can pass null literally; tests targeting other rules must pass BuildEmbeddingsEnabled() explicitly. - Order of rule checks (Rule 0 → 1 → 2 → 3) is reflected in the test configurations: tests targeting a specific rule satisfy all earlier rules so the targeted rule fires first. - StringAssert.Contains is used for error-message verification rather than full-string equality, to allow the validator's error messages to evolve without breaking tests. Test results ------------ - dotnet build src/Service.Tests/Azure.DataApiBuilder.Service.Tests.csproj -c Release → Build succeeded. 0 Warning(s). 0 Error(s). - dotnet test --filter "FullyQualifiedName~ApplyEmbedTypeOverride|FullyQualifiedName~ValidateEmbedParameters" → Total tests: 16. Passed: 16. Failed: 0. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../UnitTests/ConfigValidationUnitTests.cs | 327 ++++++++++++++++++ .../UnitTests/SqlMetadataProviderUnitTests.cs | 200 +++++++++++ 2 files changed, 527 insertions(+) diff --git a/src/Service.Tests/UnitTests/ConfigValidationUnitTests.cs b/src/Service.Tests/UnitTests/ConfigValidationUnitTests.cs index bbb4874d1a..77da4b7156 100644 --- a/src/Service.Tests/UnitTests/ConfigValidationUnitTests.cs +++ b/src/Service.Tests/UnitTests/ConfigValidationUnitTests.cs @@ -3579,6 +3579,333 @@ public void ValidateEmbeddingsOptions_EndpointDisabled_SkipsValidation() configValidator.ValidateEmbeddingsOptions(runtimeConfig); } + #region Embed Parameters Validation + + // ───────────────────────────────────────────────────────────────── + // Tests for RuntimeConfigValidator.ValidateEmbedParameters (Phase 3) + // Covers the four startup rules: + // Rule 0 — embed:true requires MSSQL data source + // Rule 1 — embed:true requires stored-procedure entity + // Rule 2 — embed:true requires runtime.embeddings configured & enabled + // Rule 3 — embed:true cannot be combined with a `default` value + // Plus multi-entity / message-content checks. + // ───────────────────────────────────────────────────────────────── + + /// + /// Builds a valid instance with embeddings enabled. + /// Used to satisfy Rule 2 in tests that target other rules. + /// + private static EmbeddingsOptions BuildEmbeddingsEnabled() => + new( + Provider: EmbeddingProviderType.AzureOpenAI, + BaseUrl: "https://test.openai.azure.com", + ApiKey: "test-key", + Enabled: true, + Model: "text-embedding-ada-002"); + + /// + /// Builds a stored-procedure with one parameter whose + /// Embed flag and Default value are caller-controlled. + /// + private static Entity BuildSprocEntity( + string sprocObject, + string paramName, + bool embed, + string defaultValue = null) + { + return new( + Source: new( + Type: EntitySourceType.StoredProcedure, + Object: sprocObject, + Parameters: new List + { + new() { Name = paramName, Embed = embed, Default = defaultValue } + }, + KeyFields: null), + Fields: null, + Rest: new(EntityRestOptions.DEFAULT_HTTP_VERBS_ENABLED_FOR_SP), + GraphQL: new($"{paramName}_singular", $"{paramName}_plural"), + Permissions: new EntityPermission[] + { + new( + Role: "anonymous", + Actions: new[] { new EntityAction(EntityActionOperation.Execute, null, new()) }) + }, + Relationships: null, + Mappings: null); + } + + /// + /// Builds an entity of the given source type (Table / View / StoredProcedure) + /// with one embed:true parameter. Used to test that embed:true is rejected + /// on non-stored-procedure entities (Rule 1). + /// + private static Entity BuildEntityWithSourceType( + EntitySourceType sourceType, + string objectName, + string paramName) + { + return new( + Source: new( + Type: sourceType, + Object: objectName, + Parameters: new List + { + new() { Name = paramName, Embed = true } + }, + KeyFields: null), + Fields: null, + Rest: new(), + GraphQL: new("entity", "entities"), + Permissions: new EntityPermission[] + { + new( + Role: "anonymous", + Actions: new[] { new EntityAction(EntityActionOperation.Read, null, new()) }) + }, + Relationships: null, + Mappings: null); + } + + /// + /// Builds a wrapping the given entities, with + /// caller-controlled data source type and embeddings configuration. + /// + /// Passing = null literally produces a config with + /// runtime.embeddings = null (the "section missing" scenario for Rule 2). Tests + /// that want the happy-path embeddings should pass BuildEmbeddingsEnabled(). + /// + private static RuntimeConfig BuildRuntimeConfigForEmbedTest( + Dictionary entities, + DatabaseType databaseType = DatabaseType.MSSQL, + EmbeddingsOptions embeddings = null) + { + return new( + Schema: "UnitTestSchema", + DataSource: new DataSource(databaseType, "", Options: null), + Runtime: new( + Rest: new(), + GraphQL: new(), + Mcp: new(), + Host: new(null, null), + Embeddings: embeddings), + Entities: new(entities)); + } + + /// + /// Rule 1: embed:true on a stored-procedure entity (with all other rules satisfied) + /// is the happy path. The validator should NOT throw and should record no + /// validation exceptions. + /// + [TestMethod] + public void ValidateEmbedParameters_EmbedOnStoredProcedure_NoError() + { + Dictionary entities = new() + { + { "SearchProducts", BuildSprocEntity("sp_search", "q", embed: true) } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, embeddings: BuildEmbeddingsEnabled()); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + // Should not throw + validator.ValidateEmbedParameters(config); + + // And no validation exceptions should have been recorded. + // (Defensive: in default mode HandleOrRecordException throws rather than + // appending, so the absence of a throw above already guarantees Count == 0. + // This explicit check documents the success criterion and would catch a + // future change that switched to record-only mode without our notice.) + Assert.AreEqual(0, validator.ConfigValidationExceptions.Count, + "Expected no validation exceptions for the happy path"); + + // Sanity: the entity under test is wired correctly so a future refactor + // can't make this test pass by accidentally short-circuiting. + Assert.AreEqual(EntitySourceType.StoredProcedure, config.Entities["SearchProducts"].Source.Type); + Assert.IsTrue(config.Entities["SearchProducts"].Source.Parameters!.Any(p => p.Embed), + "Test setup expected at least one embed:true parameter"); + } + + /// + /// Rule 1: embed:true on a Table or View entity must throw a config-validation + /// exception naming the offending entity, parameter, and source-type constraint. + /// + [DataTestMethod] + [DataRow(EntitySourceType.Table, DisplayName = "Rule 1: embed:true on Table → 503")] + [DataRow(EntitySourceType.View, DisplayName = "Rule 1: embed:true on View → 503")] + public void ValidateEmbedParameters_EmbedOnNonSproc_ThrowsConfigError(EntitySourceType sourceType) + { + Dictionary entities = new() + { + { "Product", BuildEntityWithSourceType(sourceType, "products", "q") } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, embeddings: BuildEmbeddingsEnabled()); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + DataApiBuilderException ex = Assert.ThrowsException( + () => validator.ValidateEmbedParameters(config)); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + Assert.AreEqual(DataApiBuilderException.SubStatusCodes.ConfigValidationError, ex.SubStatusCode); + StringAssert.Contains(ex.Message, "Product"); + StringAssert.Contains(ex.Message, "q"); + StringAssert.Contains(ex.Message, "stored-procedure"); + } + + /// + /// Rule 2: embed:true with embeddings disabled (or absent) must throw a + /// config-validation exception. Two flavors: section present but enabled=false, + /// and section completely absent (null). + /// + [DataTestMethod] + [DataRow(false, DisplayName = "Rule 2: embeddings.enabled=false → 503")] + [DataRow(true, DisplayName = "Rule 2: embeddings section missing → 503")] + public void ValidateEmbedParameters_EmbedTrue_EmbeddingsNotConfigured_ThrowsConfigError(bool nullEmbeddings) + { + EmbeddingsOptions embeddings = nullEmbeddings + ? null + : new EmbeddingsOptions( + Provider: EmbeddingProviderType.OpenAI, + BaseUrl: "https://api.openai.com", + ApiKey: "key", + Enabled: false); + + Dictionary entities = new() + { + { "SearchProducts", BuildSprocEntity("sp_search", "q", embed: true) } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, embeddings: embeddings); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + DataApiBuilderException ex = Assert.ThrowsException( + () => validator.ValidateEmbedParameters(config)); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + Assert.AreEqual(DataApiBuilderException.SubStatusCodes.ConfigValidationError, ex.SubStatusCode); + StringAssert.Contains(ex.Message, "SearchProducts"); + StringAssert.Contains(ex.Message, "q"); + StringAssert.Contains(ex.Message, "runtime.embeddings"); + } + + /// + /// Rule 3: embed:true combined with a default value is not supported. + /// Should throw a config-validation exception even though all other rules pass. + /// + [TestMethod] + public void ValidateEmbedParameters_EmbedTrue_WithDefault_ThrowsConfigError() + { + Dictionary entities = new() + { + { + "SearchProducts", + BuildSprocEntity("sp_search", "q", embed: true, defaultValue: "wireless headphones") + } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, embeddings: BuildEmbeddingsEnabled()); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + DataApiBuilderException ex = Assert.ThrowsException( + () => validator.ValidateEmbedParameters(config)); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + Assert.AreEqual(DataApiBuilderException.SubStatusCodes.ConfigValidationError, ex.SubStatusCode); + StringAssert.Contains(ex.Message, "SearchProducts"); + StringAssert.Contains(ex.Message, "q"); + StringAssert.Contains(ex.Message, "default"); + } + + /// + /// Rule 3 happy path: embed:true with no default value is allowed. The validator + /// should NOT throw and should record no validation exceptions. + /// + [TestMethod] + public void ValidateEmbedParameters_EmbedTrue_WithoutDefault_NoError() + { + Dictionary entities = new() + { + { "SearchProducts", BuildSprocEntity("sp_search", "q", embed: true, defaultValue: null) } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, embeddings: BuildEmbeddingsEnabled()); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + // Should not throw + validator.ValidateEmbedParameters(config); + + // And no validation exceptions should have been recorded. + Assert.AreEqual(0, validator.ConfigValidationExceptions.Count, + "Expected no validation exceptions when embed:true is paired with no default"); + + // Sanity: the embed param under test was set up with no default, as intended. + ParameterMetadata embedParam = config.Entities["SearchProducts"].Source.Parameters! + .Single(p => p.Embed); + Assert.IsNull(embedParam.Default, "Test setup expected the embed param to have no default"); + } + + /// + /// Rule 0: embed:true is only supported on MSSQL data sources. The validator + /// should reject embed:true on PostgreSQL, MySQL, DWSQL, or CosmosDB. + /// + [DataTestMethod] + [DataRow(DatabaseType.PostgreSQL, DisplayName = "Rule 0: embed:true on PostgreSQL → 503")] + [DataRow(DatabaseType.MySQL, DisplayName = "Rule 0: embed:true on MySQL → 503")] + public void ValidateEmbedParameters_EmbedTrue_NonMssqlDataSource_ThrowsConfigError(DatabaseType dbType) + { + Dictionary entities = new() + { + { "SearchProducts", BuildSprocEntity("sp_search", "q", embed: true) } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, databaseType: dbType, embeddings: BuildEmbeddingsEnabled()); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + DataApiBuilderException ex = Assert.ThrowsException( + () => validator.ValidateEmbedParameters(config)); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + Assert.AreEqual(DataApiBuilderException.SubStatusCodes.ConfigValidationError, ex.SubStatusCode); + StringAssert.Contains(ex.Message, dbType.ToString()); + StringAssert.Contains(ex.Message, "Azure SQL"); + } + + /// + /// When multiple entities exist and ONE has an embed-rule violation, the + /// thrown exception must clearly identify the offending entity and parameter + /// (so the user can fix their config without grep-and-guess). + /// + [TestMethod] + public void ValidateEmbedParameters_MultipleEntities_OneViolates_NamesViolatingEntityAndParam() + { + Dictionary entities = new() + { + // Healthy entities + { "OkProducts", BuildSprocEntity("sp_ok", "q", embed: true) }, + { "AlsoOk", BuildSprocEntity("sp_other", "input", embed: true) }, + // The violator: embed:true on a table + { "BadEntity", BuildEntityWithSourceType(EntitySourceType.Table, "bad_table", "queryParam") } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, embeddings: BuildEmbeddingsEnabled()); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + DataApiBuilderException ex = Assert.ThrowsException( + () => validator.ValidateEmbedParameters(config)); + + // Must name the offending entity and param specifically + StringAssert.Contains(ex.Message, "BadEntity"); + StringAssert.Contains(ex.Message, "queryParam"); + // Must not name the healthy ones + Assert.IsFalse(ex.Message.Contains("OkProducts"), + "Error should not mention healthy entities; was: " + ex.Message); + Assert.IsFalse(ex.Message.Contains("AlsoOk"), + "Error should not mention healthy entities; was: " + ex.Message); + } + + #endregion + private static RuntimeConfigValidator InitializeRuntimeConfigValidator() { MockFileSystem fileSystem = new(); diff --git a/src/Service.Tests/UnitTests/SqlMetadataProviderUnitTests.cs b/src/Service.Tests/UnitTests/SqlMetadataProviderUnitTests.cs index dd6ad7d27e..d15985d5dd 100644 --- a/src/Service.Tests/UnitTests/SqlMetadataProviderUnitTests.cs +++ b/src/Service.Tests/UnitTests/SqlMetadataProviderUnitTests.cs @@ -670,5 +670,205 @@ public async Task CheckAutoentitiesQuery(string[] include, string[] exclude, str TestHelper.UnsetAllDABEnvironmentVariables(); } + + #region Embed Type Override + + // ───────────────────────────────────────────────────────────────── + // Tests for MsSqlMetadataProvider.ApplyEmbedTypeOverride (Phase 3) + // + // The helper is invoked at startup for each parameter defined in + // config for a stored procedure. When embed:true is set, it: + // - validates the parameter is VECTOR-shaped (SystemType == byte[]) + // - overrides SystemType/DbType/SqlDbType to flow as String + // + // These tests construct ParameterDefinition / ParameterMetadata + // pairs directly and invoke the helper — no DB, no DI container, + // no full metadata-provider construction. + // ───────────────────────────────────────────────────────────────── + + /// + /// embed:true on a parameter that SQL Server reports as Byte[] (i.e., VECTOR(N) + /// surfaced via INFORMATION_SCHEMA.PARAMETERS as varbinary) overrides the + /// metadata type so the vector JSON string flows through DAB's String pipeline. + /// + [TestMethod] + public void ApplyEmbedTypeOverride_ByteArrayParam_WithEmbedTrue_OverridesToString() + { + ParameterDefinition def = new() + { + Name = "query_vector", + SystemType = typeof(byte[]) + }; + ParameterMetadata meta = new() { Name = "query_vector", Embed = true }; + + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + def, meta, schemaName: "dbo", storedProcedureName: "sp_search", parameterName: "query_vector"); + + Assert.AreEqual(typeof(string), def.SystemType); + Assert.AreEqual(System.Data.DbType.String, def.DbType); + Assert.AreEqual(System.Data.SqlDbType.NVarChar, def.SqlDbType); + } + + /// + /// embed:true on a parameter that the SQL Server reports as something other than + /// Byte[] — e.g., a real string param — must throw at startup with a 503 and + /// a clear message naming the schema/sproc/parameter and saying VECTOR(N) is required. + /// + [TestMethod] + public void ApplyEmbedTypeOverride_StringParam_WithEmbedTrue_ThrowsAtStartup() + { + ParameterDefinition def = new() + { + Name = "query_text", + SystemType = typeof(string) + }; + ParameterMetadata meta = new() { Name = "query_text", Embed = true }; + + DataApiBuilderException ex = Assert.ThrowsException(() => + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + def, meta, schemaName: "dbo", storedProcedureName: "sp_test", parameterName: "query_text")); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + Assert.AreEqual(DataApiBuilderException.SubStatusCodes.ErrorInInitialization, ex.SubStatusCode); + StringAssert.Contains(ex.Message, "dbo.sp_test"); + StringAssert.Contains(ex.Message, "query_text"); + StringAssert.Contains(ex.Message, "String"); + StringAssert.Contains(ex.Message, "VECTOR"); + } + + /// + /// embed:true on a numeric parameter (e.g., INT) is also rejected with the same + /// 503. Catches a common misconfiguration where embed is applied to a count param. + /// + [TestMethod] + public void ApplyEmbedTypeOverride_IntParam_WithEmbedTrue_ThrowsAtStartup() + { + ParameterDefinition def = new() + { + Name = "top_k", + SystemType = typeof(int) + }; + ParameterMetadata meta = new() { Name = "top_k", Embed = true }; + + DataApiBuilderException ex = Assert.ThrowsException(() => + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + def, meta, schemaName: "dbo", storedProcedureName: "sp_top", parameterName: "top_k")); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + Assert.AreEqual(DataApiBuilderException.SubStatusCodes.ErrorInInitialization, ex.SubStatusCode); + StringAssert.Contains(ex.Message, "Int32"); + StringAssert.Contains(ex.Message, "VECTOR"); + } + + /// + /// embed:false on a Byte[] parameter (e.g., a real varbinary blob like an image) + /// must NOT be touched — the override only fires for embed:true params. The + /// parameter's metadata should be returned exactly as it came in. + /// + [TestMethod] + public void ApplyEmbedTypeOverride_ByteArrayParam_WithEmbedFalse_NoChange() + { + ParameterDefinition def = new() + { + Name = "image_blob", + SystemType = typeof(byte[]), + DbType = System.Data.DbType.Binary, + SqlDbType = System.Data.SqlDbType.VarBinary + }; + ParameterMetadata meta = new() { Name = "image_blob", Embed = false }; + + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + def, meta, schemaName: "dbo", storedProcedureName: "sp_upload", parameterName: "image_blob"); + + // Original type metadata preserved exactly + Assert.AreEqual(typeof(byte[]), def.SystemType); + Assert.AreEqual(System.Data.DbType.Binary, def.DbType); + Assert.AreEqual(System.Data.SqlDbType.VarBinary, def.SqlDbType); + } + + /// + /// Edge case: embed:true behavior is independent of + /// and . The override fires purely on + /// + being byte[]. Other + /// metadata fields are not affected. + /// + [TestMethod] + public void ApplyEmbedTypeOverride_ByteArrayWithRequiredAndDefault_OnlyTypeMetadataChanges() + { + ParameterDefinition def = new() + { + Name = "v", + SystemType = typeof(byte[]), + Required = true, + Default = "should-be-untouched-by-override", + HasConfigDefault = true, + ConfigDefaultValue = "should-be-untouched-by-override" + }; + ParameterMetadata meta = new() { Name = "v", Embed = true }; + + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + def, meta, schemaName: "dbo", storedProcedureName: "sp_v", parameterName: "v"); + + // Type metadata overridden + Assert.AreEqual(typeof(string), def.SystemType); + Assert.AreEqual(System.Data.DbType.String, def.DbType); + Assert.AreEqual(System.Data.SqlDbType.NVarChar, def.SqlDbType); + + // Other fields untouched (note: validator separately rejects embed:true + default + // before request execution; this test confirms the metadata helper itself is + // narrow-scoped and doesn't mutate fields outside its concern). + Assert.AreEqual(true, def.Required); + Assert.AreEqual("should-be-untouched-by-override", def.Default); + Assert.AreEqual(true, def.HasConfigDefault); + Assert.AreEqual("should-be-untouched-by-override", def.ConfigDefaultValue); + } + + /// + /// Multi-parameter scenario: in a real metadata-loading loop, the helper is + /// called once per parameter. Calling it on a mix of embed:true Byte[] params + /// (overridden) and embed:false params (untouched) confirms the helper has no + /// cross-call state. + /// + [TestMethod] + public void ApplyEmbedTypeOverride_MultipleParams_OnlyEmbedTrueOnesOverridden() + { + ParameterDefinition embedDef = new() + { + Name = "primary_query", + SystemType = typeof(byte[]) + }; + ParameterDefinition normalDef = new() + { + Name = "top_k", + SystemType = typeof(int) + }; + ParameterDefinition anotherEmbedDef = new() + { + Name = "secondary_query", + SystemType = typeof(byte[]) + }; + + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + embedDef, + new ParameterMetadata { Name = "primary_query", Embed = true }, + schemaName: "dbo", storedProcedureName: "sp_hybrid", parameterName: "primary_query"); + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + normalDef, + new ParameterMetadata { Name = "top_k", Embed = false }, + schemaName: "dbo", storedProcedureName: "sp_hybrid", parameterName: "top_k"); + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + anotherEmbedDef, + new ParameterMetadata { Name = "secondary_query", Embed = true }, + schemaName: "dbo", storedProcedureName: "sp_hybrid", parameterName: "secondary_query"); + + // Both embed:true params overridden to String pipeline + Assert.AreEqual(typeof(string), embedDef.SystemType); + Assert.AreEqual(typeof(string), anotherEmbedDef.SystemType); + + // Non-embed param's type unchanged + Assert.AreEqual(typeof(int), normalDef.SystemType); + } + + #endregion } } From ca734a28785b1d11ba9a45c2c0f7c26210e5ed41 Mon Sep 17 00:00:00 2001 From: Pratik Shrivastav Date: Fri, 15 May 2026 14:13:28 -0700 Subject: [PATCH 11/11] Phase 3 Stage 3.10: Address PR review feedback from ajtiwari07 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two minor improvements to RuntimeConfigValidator.ValidateEmbedParameters based on review comments on PR #1. 1. Add entity-level fast-path short-circuit (line ~452) Skip entities whose parameters list contains no embed:true entry, before doing the data-source lookup and entering the inner param loop. Avoids GetDataSourceFromEntityName() and the inner foreach for the common case of entities whose params are all normal pass-through. Before: foreach entity: if Parameters is null: continue lookup data source (work) foreach param: if !Embed: continue (work, repeated per-param) ... rules ... After: foreach entity: if Parameters is null: continue if !Parameters.Any(p => p.Embed): continue (NEW fast-path) lookup data source foreach param: if !Embed: continue ... rules ... The inner !Embed continue is left in place so Rule fields (param.Name etc.) are still scoped per-param when rules fire. 2. Add TODO comment near the MSSQL-only check (line ~477) One-line // TODO: comment noting that PostgreSQL/MySQL could be supported once their metadata providers grow embed-aware type-override logic. Documents the intentional scope boundary for future contributors. Behavior preserved ------------------ - All 4 validation rules still fire identically when embed:true params are present. - Skipped entities (no embed:true params) produce the same observable behavior — the inner loop's per-param continue would have skipped them anyway. The change is pure performance; no rules are bypassed. Verification ------------ - dotnet build src/Core/Azure.DataApiBuilder.Core.csproj -c Release → Build succeeded. 0 Warning(s). 0 Error(s). - All 44 Phase 3 unit tests pass: dotnet test --filter "FullyQualifiedName~ParameterEmbeddingHelperTests |FullyQualifiedName~ValidateEmbedParameters |FullyQualifiedName~ApplyEmbedTypeOverride" → Passed: 44, Failed: 0. - The 10 ValidateEmbedParameters tests in particular still cover all 4 rules; the fast-path change doesn't bypass any of them when an embed:true param is present. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/Core/Configurations/RuntimeConfigValidator.cs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/Core/Configurations/RuntimeConfigValidator.cs b/src/Core/Configurations/RuntimeConfigValidator.cs index 8fcdb5fd47..ec44f18dd6 100644 --- a/src/Core/Configurations/RuntimeConfigValidator.cs +++ b/src/Core/Configurations/RuntimeConfigValidator.cs @@ -453,6 +453,14 @@ internal void ValidateEmbedParameters(RuntimeConfig runtimeConfig) continue; } + // Fast-path: skip entities with no embed:true parameters entirely. + // Avoids the data-source lookup and inner loop for the common case of + // entities whose params are all normal pass-through. + if (!entity.Source.Parameters.Any(p => p.Embed)) + { + continue; + } + // Hoist data source lookup outside the param loop — it's entity-scoped, not param-scoped. // Looked up once per entity instead of once per parameter (was duplicated work in Stage 3.5). DataSource entityDataSource = runtimeConfig.GetDataSourceFromEntityName(entityName); @@ -474,6 +482,7 @@ internal void ValidateEmbedParameters(RuntimeConfig runtimeConfig) // For PostgreSQL/MySQL/Cosmos, the request would fail at runtime with a confusing // type error. Reject at startup instead. // Example FAIL: PostgreSQL entity with embed:true → "embed feature only supported for MSSQL" + // TODO: Extend to PostgreSQL/MySQL once their metadata providers grow embed-aware type-override logic. if (entityDataSource.DatabaseType != DatabaseType.MSSQL) { HandleOrRecordException(new DataApiBuilderException(