diff --git a/schemas/dab.draft.schema.json b/schemas/dab.draft.schema.json index 41c62d4f90..c4b9ce9c89 100644 --- a/schemas/dab.draft.schema.json +++ b/schemas/dab.draft.schema.json @@ -1245,7 +1245,8 @@ "name": { "type": "string", "description": "Parameter name" }, "required": { "$ref": "#/$defs/boolean-or-string", "description": "Is parameter required" }, "default": { "type": ["string", "number", "boolean", "null"], "description": "Default value" }, - "description": { "type": "string", "description": "Parameter description. Since descriptions for multiple parameters are provided as a comma-separated string, individual parameter descriptions must not contain a comma (',')." } + "description": { "type": "string", "description": "Parameter description. Since descriptions for multiple parameters are provided as a comma-separated string, individual parameter descriptions must not contain a comma (',')." }, + "embed": { "type": "boolean", "description": "When true, the parameter text is automatically converted to an embedding vector via the configured embedding service before being passed to the stored procedure. Requires runtime.embeddings to be configured. Only valid on stored-procedure entities. The target stored procedure parameter must be declared as VECTOR(N) — DAB cannot detect non-VECTOR misconfigurations at startup due to SQL Server metadata limitations.", "default": false } } } } diff --git a/src/Config/ObjectModel/ParameterMetadata.cs b/src/Config/ObjectModel/ParameterMetadata.cs index 334979d728..f0107e087a 100644 --- a/src/Config/ObjectModel/ParameterMetadata.cs +++ b/src/Config/ObjectModel/ParameterMetadata.cs @@ -24,5 +24,19 @@ public class ParameterMetadata /// Gets or sets the default value of the parameter, if any. /// public string? Default { get; set; } + + /// + /// When true, the parameter value (text) is automatically embedded via the + /// EmbeddingService and the resulting vector is passed to the stored procedure. + /// Only valid on stored-procedure entities when runtime.embeddings is configured. + /// + /// IMPORTANT: The target stored procedure parameter must be declared as VECTOR(N). + /// SQL Server's metadata system reports VECTOR(N) and varbinary indistinguishably, + /// so DAB cannot detect this misconfiguration at startup. If embed:true is applied + /// to a non-VECTOR parameter (e.g., NVARCHAR or VARBINARY), the request will fail + /// at runtime with a SQL error or return semantically incorrect results. + /// It is the developer's responsibility to ensure the sproc parameter is VECTOR(N). + /// + public bool Embed { get; set; } } } diff --git a/src/Core/Azure.DataApiBuilder.Core.csproj b/src/Core/Azure.DataApiBuilder.Core.csproj index 8b32a8682c..d4990a9cbc 100644 --- a/src/Core/Azure.DataApiBuilder.Core.csproj +++ b/src/Core/Azure.DataApiBuilder.Core.csproj @@ -58,6 +58,10 @@ + + + + diff --git a/src/Core/Configurations/RuntimeConfigValidator.cs b/src/Core/Configurations/RuntimeConfigValidator.cs index 32b2ff93b6..ec44f18dd6 100644 --- a/src/Core/Configurations/RuntimeConfigValidator.cs +++ b/src/Core/Configurations/RuntimeConfigValidator.cs @@ -98,6 +98,7 @@ public void ValidateConfigProperties() ValidateAzureLogAnalyticsAuth(runtimeConfig); ValidateFileSinkPath(runtimeConfig); ValidateEmbeddingsOptions(runtimeConfig); + ValidateEmbedParameters(runtimeConfig); } /// @@ -421,7 +422,154 @@ public void ValidateEmbeddingsOptions(RuntimeConfig runtimeConfig) subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); } } + } + + /// + /// Validates that parameters with embed=true are only used on stored-procedure entities + /// and that runtime.embeddings is configured when embed parameters are present. + /// + /// + /// Internal (rather than private) to allow direct unit testing via the + /// InternalsVisibleTo attribute on Azure.DataApiBuilder.Core. Callers outside + /// the assembly should still go through . + /// + internal void ValidateEmbedParameters(RuntimeConfig runtimeConfig) + { + // Check once whether the embedding service is configured and enabled. + // Example: "runtime": { "embeddings": { "enabled": true, "provider": "azure-openai" } } → true + // Example: embeddings section missing or "enabled": false → false + bool embeddingsConfigured = runtimeConfig.Runtime?.Embeddings is not null + && runtimeConfig.Runtime.Embeddings.Enabled; + + // Loop through every entity in the config. + // Example entities: "Product" (table), "Category" (table), "SearchProducts" (sproc) + foreach ((string entityName, Entity entity) in runtimeConfig.Entities) + { + // Skip entities that have no parameters defined. + // Tables and views typically don't have parameters. + // Example: "Product": { "source": { "type": "table" } } → Parameters is null → skip + if (entity.Source.Parameters is null) + { + continue; + } + // Fast-path: skip entities with no embed:true parameters entirely. + // Avoids the data-source lookup and inner loop for the common case of + // entities whose params are all normal pass-through. + if (!entity.Source.Parameters.Any(p => p.Embed)) + { + continue; + } + + // Hoist data source lookup outside the param loop — it's entity-scoped, not param-scoped. + // Looked up once per entity instead of once per parameter (was duplicated work in Stage 3.5). + DataSource entityDataSource = runtimeConfig.GetDataSourceFromEntityName(entityName); + + // Check each parameter for the embed flag. + // Example: iterates over { "name": "query_vector", "embed": true } and { "name": "top_k", "default": "5" } + foreach (ParameterMetadata param in entity.Source.Parameters) + { + // Skip parameters that don't have embed: true. Most params are normal pass-through. + // Example: "top_k" has Embed=false (default) → skip + // Example: "query_vector" has Embed=true → continue to validation checks + if (!param.Embed) + { + continue; + } + + // Rule 0: embed:true is only supported on Azure SQL / SQL Server data sources. + // The metadata type override (Byte[] → String) only exists in MsSqlMetadataProvider. + // For PostgreSQL/MySQL/Cosmos, the request would fail at runtime with a confusing + // type error. Reject at startup instead. + // Example FAIL: PostgreSQL entity with embed:true → "embed feature only supported for MSSQL" + // TODO: Extend to PostgreSQL/MySQL once their metadata providers grow embed-aware type-override logic. + if (entityDataSource.DatabaseType != DatabaseType.MSSQL) + { + HandleOrRecordException(new DataApiBuilderException( + message: $"Entity '{entityName}': parameter '{param.Name}' has 'embed: true' but the data source type is '{entityDataSource.DatabaseType}'. The embed feature is currently only supported for Azure SQL / SQL Server.", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); + } + + // Rule 1: embed:true is only valid on stored-procedure entities. + // Tables/views don't have user-supplied parameters that get passed to SQL. + // Example PASS: "SearchProducts": { "source": { "type": "stored-procedure" } } + // Example FAIL: "Product": { "source": { "type": "table", "parameters": [{"name":"x","embed":true}] } } + // → Error: "Entity 'Product': parameter 'x' has 'embed: true' but is only valid on stored-procedure entities." + if (entity.Source.Type is not EntitySourceType.StoredProcedure) + { + HandleOrRecordException(new DataApiBuilderException( + message: $"Entity '{entityName}': parameter '{param.Name}' has 'embed: true' but is only valid on stored-procedure entities.", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); + } + + // Rule 2: embed:true requires runtime.embeddings to be configured and enabled. + // Can't convert text to vectors without an embedding service. + // Example PASS: "embeddings": { "enabled": true, "provider": "azure-openai", "api-key": "..." } + // Example FAIL: "embeddings": { "enabled": false } or embeddings section missing + // → Error: "parameter 'query_vector' has 'embed: true' but runtime.embeddings is not configured or not enabled." + if (!embeddingsConfigured) + { + HandleOrRecordException(new DataApiBuilderException( + message: $"Entity '{entityName}': parameter '{param.Name}' has 'embed: true' but runtime.embeddings is not configured or not enabled.", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); + } + + // Rule 3: embed:true with a default value is not supported. + // + // An embed parameter represents the user's text input that gets converted + // to a vector at request time — typically a semantic-search query. + // + // Setting a default for an embed parameter would mean: if the client doesn't + // supply a search query, the server invents one (e.g., "wireless headphones"), + // embeds it, and runs a semantic search the user never asked for. That isn't + // a fallback — it's the server fabricating user input. In any real UX, a + // missing search query indicates a client bug or an empty search box, not an + // invitation for the server to substitute a canned query on the user's behalf. + // + // (Defaults on non-embed parameters of the same sproc are unaffected by this + // rule and continue to work as before.) + // + // Even setting aside the UX concern, supporting embed-defaults would be + // non-trivial: + // - GraphQL schema defaults are baked in at startup as typed literals + // (GraphQLStoredProcedureBuilder.ConvertValueToGraphQLType). There is no + // VECTOR literal type in GraphQL, and the literal text would surface in + // introspection as a misleading default value for an embedded parameter. + // - REST/MCP defaults are injected as plain text into the resolved-parameter + // dictionary, then would be re-embedded by ParameterEmbeddingHelper on + // every request — a hidden per-request cost for a value the client never + // sent. + // - Embedding the default once at startup would couple application startup + // to the embedding provider's network availability (validation runs in + // CLI / startup contexts that may not have outbound access). + // + // What happens today if a client forgets to supply an embed parameter: + // - {"query_vector": null} or "" → 400 BadRequest "has 'embed: true' but + // the provided text is empty or whitespace." (caught by ParameterEmbeddingHelper) + // - field omitted entirely → 400 DatabaseInputError "expects parameter + // '@query_vector', which was not supplied." (SQL Server error, parsed + // by MsSqlDbExceptionParser) + // Both produce a clear, actionable client error — no silent failure. + // + // If a real use case for embed-defaults ever emerges, this rule can be lifted + // with the matching runtime support added. For now, embed parameters should + // always be supplied by the client. + // + // Example PASS: { "name": "query_vector", "embed": true } (no default) + // Example FAIL: { "name": "query_vector", "embed": true, "default": "wireless headphones" } + // → Error: "parameter 'query_vector' has both 'embed: true' and a 'default' value. Embed parameters cannot have default values." + if (param.Default is not null) + { + HandleOrRecordException(new DataApiBuilderException( + message: $"Entity '{entityName}': parameter '{param.Name}' has both 'embed: true' and a 'default' value. Embed parameters cannot have default values.", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.ConfigValidationError)); + } + } + } } /// diff --git a/src/Core/Resolvers/Factories/MutationEngineFactory.cs b/src/Core/Resolvers/Factories/MutationEngineFactory.cs index 9b4df9f8bf..f335ba87eb 100644 --- a/src/Core/Resolvers/Factories/MutationEngineFactory.cs +++ b/src/Core/Resolvers/Factories/MutationEngineFactory.cs @@ -8,6 +8,7 @@ using Azure.DataApiBuilder.Core.Configurations; using Azure.DataApiBuilder.Core.Models; using Azure.DataApiBuilder.Core.Services.MetadataProviders; +using Azure.DataApiBuilder.Core.Services.Embeddings; using Azure.DataApiBuilder.Service.Exceptions; using Microsoft.AspNetCore.Http; using static Azure.DataApiBuilder.Config.DabConfigEvents; @@ -29,18 +30,11 @@ public class MutationEngineFactory : IMutationEngineFactory private readonly IHttpContextAccessor _httpContextAccessor; private readonly IAuthorizationResolver _authorizationResolver; private readonly GQLFilterParser _gQLFilterParser; + private readonly IEmbeddingService? _embeddingService; /// /// Initializes a new instance of the class. /// - /// runtimeConfigProvider. - /// queryManagerFactory - /// metadataProviderFactory. - /// cosmosClientProvider - /// queryEngineFactory. - /// httpContextAccessor. - /// authorizationResolver. - /// GqlFilterParser. public MutationEngineFactory(RuntimeConfigProvider runtimeConfigProvider, IAbstractQueryManagerFactory queryManagerFactory, IMetadataProviderFactory metadataProviderFactory, @@ -49,7 +43,8 @@ public MutationEngineFactory(RuntimeConfigProvider runtimeConfigProvider, IHttpContextAccessor httpContextAccessor, IAuthorizationResolver authorizationResolver, GQLFilterParser gQLFilterParser, - HotReloadEventHandler? handler) + HotReloadEventHandler? handler, + IEmbeddingService? embeddingService = null) { handler?.Subscribe(MUTATION_ENGINE_FACTORY_ON_CONFIG_CHANGED, OnConfigChanged); @@ -61,6 +56,7 @@ public MutationEngineFactory(RuntimeConfigProvider runtimeConfigProvider, _queryEngineFactory = queryEngineFactory; _runtimeConfigProvider = runtimeConfigProvider; _gQLFilterParser = gQLFilterParser; + _embeddingService = embeddingService; _mutationEngines = new Dictionary(); ConfigureMutationEngines(); } @@ -78,7 +74,8 @@ private void ConfigureMutationEngines() _authorizationResolver, _gQLFilterParser, _httpContextAccessor, - _runtimeConfigProvider); + _runtimeConfigProvider, + _embeddingService); _mutationEngines.Add(DatabaseType.MySQL, mutationEngine); _mutationEngines.Add(DatabaseType.MSSQL, mutationEngine); _mutationEngines.Add(DatabaseType.PostgreSQL, mutationEngine); diff --git a/src/Core/Resolvers/Factories/QueryEngineFactory.cs b/src/Core/Resolvers/Factories/QueryEngineFactory.cs index 1d2ae2935d..9ae91015ad 100644 --- a/src/Core/Resolvers/Factories/QueryEngineFactory.cs +++ b/src/Core/Resolvers/Factories/QueryEngineFactory.cs @@ -9,6 +9,7 @@ using Azure.DataApiBuilder.Core.Models; using Azure.DataApiBuilder.Core.Services.Cache; using Azure.DataApiBuilder.Core.Services.MetadataProviders; +using Azure.DataApiBuilder.Core.Services.Embeddings; using Azure.DataApiBuilder.Service.Exceptions; using Microsoft.AspNetCore.Http; using Microsoft.Extensions.Logging; @@ -33,6 +34,7 @@ public class QueryEngineFactory : IQueryEngineFactory private readonly GQLFilterParser _gQLFilterParser; private readonly DabCacheService _cache; private readonly ILogger _logger; + private readonly IEmbeddingService? _embeddingService; /// public QueryEngineFactory(RuntimeConfigProvider runtimeConfigProvider, @@ -44,7 +46,8 @@ public QueryEngineFactory(RuntimeConfigProvider runtimeConfigProvider, GQLFilterParser gQLFilterParser, ILogger logger, DabCacheService cache, - HotReloadEventHandler? handler) + HotReloadEventHandler? handler, + IEmbeddingService? embeddingService = null) { handler?.Subscribe(QUERY_ENGINE_FACTORY_ON_CONFIG_CHANGED, OnConfigChanged); _queryEngines = new Dictionary(); @@ -57,6 +60,7 @@ public QueryEngineFactory(RuntimeConfigProvider runtimeConfigProvider, _gQLFilterParser = gQLFilterParser; _cache = cache; _logger = logger; + _embeddingService = embeddingService; ConfigureQueryEngines(); } @@ -75,7 +79,8 @@ public void ConfigureQueryEngines() _gQLFilterParser, _logger, _runtimeConfigProvider, - _cache); + _cache, + _embeddingService); _queryEngines.Add(DatabaseType.MSSQL, queryEngine); _queryEngines.Add(DatabaseType.MySQL, queryEngine); _queryEngines.Add(DatabaseType.PostgreSQL, queryEngine); diff --git a/src/Core/Resolvers/SqlMutationEngine.cs b/src/Core/Resolvers/SqlMutationEngine.cs index 204d71d44f..75e43709b3 100644 --- a/src/Core/Resolvers/SqlMutationEngine.cs +++ b/src/Core/Resolvers/SqlMutationEngine.cs @@ -16,6 +16,7 @@ using Azure.DataApiBuilder.Core.Resolvers.Sql_Query_Structures; using Azure.DataApiBuilder.Core.Services; using Azure.DataApiBuilder.Core.Services.MetadataProviders; +using Azure.DataApiBuilder.Core.Services.Embeddings; using Azure.DataApiBuilder.Service.Exceptions; using Azure.DataApiBuilder.Service.GraphQLBuilder; using Azure.DataApiBuilder.Service.GraphQLBuilder.Mutations; @@ -41,6 +42,7 @@ public class SqlMutationEngine : IMutationEngine private readonly IHttpContextAccessor _httpContextAccessor; private readonly GQLFilterParser _gQLFilterParser; private readonly RuntimeConfigProvider _runtimeConfigProvider; + private readonly IEmbeddingService? _embeddingService; public const string IS_UPDATE_RESULT_SET = "IsUpdateResultSet"; private const string TRANSACTION_EXCEPTION_ERROR_MSG = "An unexpected error occurred during the transaction execution"; public const string SINGLE_INPUT_ARGUEMENT_NAME = "item"; @@ -60,7 +62,8 @@ public SqlMutationEngine( IAuthorizationResolver authorizationResolver, GQLFilterParser gQLFilterParser, IHttpContextAccessor httpContextAccessor, - RuntimeConfigProvider runtimeConfigProvider) + RuntimeConfigProvider runtimeConfigProvider, + IEmbeddingService? embeddingService = null) { _queryManagerFactory = queryManagerFactory; _sqlMetadataProviderFactory = sqlMetadataProviderFactory; @@ -69,6 +72,7 @@ public SqlMutationEngine( _httpContextAccessor = httpContextAccessor; _gQLFilterParser = gQLFilterParser; _runtimeConfigProvider = runtimeConfigProvider; + _embeddingService = embeddingService; } /// @@ -347,6 +351,16 @@ private static bool IsPointMutation(IMiddlewareContext context) ISqlMetadataProvider sqlMetadataProvider = _sqlMetadataProviderFactory.GetMetadataProvider(dataSourceName); IQueryBuilder queryBuilder = _queryManagerFactory.GetQueryBuilder(sqlMetadataProvider.GetDatabaseType()); IQueryExecutor queryExecutor = _queryManagerFactory.GetQueryExecutor(sqlMetadataProvider.GetDatabaseType()); + + // Phase 3: substitute embed:true parameters with embedding vectors (REST mutation path) + // Helper handles the case where embed params exist but service is null (throws 503). + Entity entity = _runtimeConfigProvider.GetConfig().Entities[context.EntityName]; + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + context.ResolvedParameters, + entity.Source.Parameters, + _embeddingService, + _httpContextAccessor.HttpContext?.RequestAborted ?? CancellationToken.None); + SqlExecuteStructure executeQueryStructure = new( context.EntityName, sqlMetadataProvider, diff --git a/src/Core/Resolvers/SqlQueryEngine.cs b/src/Core/Resolvers/SqlQueryEngine.cs index f567251771..9c98e3465a 100644 --- a/src/Core/Resolvers/SqlQueryEngine.cs +++ b/src/Core/Resolvers/SqlQueryEngine.cs @@ -12,6 +12,7 @@ using Azure.DataApiBuilder.Core.Services; using Azure.DataApiBuilder.Core.Services.Cache; using Azure.DataApiBuilder.Core.Services.MetadataProviders; +using Azure.DataApiBuilder.Core.Services.Embeddings; using Azure.DataApiBuilder.Service.Exceptions; using Azure.DataApiBuilder.Service.GraphQLBuilder; using Azure.DataApiBuilder.Service.GraphQLBuilder.Queries; @@ -37,6 +38,7 @@ public class SqlQueryEngine : IQueryEngine private readonly RuntimeConfigProvider _runtimeConfigProvider; private readonly GQLFilterParser _gQLFilterParser; private readonly DabCacheService _cache; + private readonly IEmbeddingService? _embeddingService; // // Constructor. @@ -49,7 +51,8 @@ public SqlQueryEngine( GQLFilterParser gQLFilterParser, ILogger logger, RuntimeConfigProvider runtimeConfigProvider, - DabCacheService cache) + DabCacheService cache, + IEmbeddingService? embeddingService = null) { _queryFactory = queryFactory; _sqlMetadataProviderFactory = sqlMetadataProviderFactory; @@ -59,6 +62,7 @@ public SqlQueryEngine( _logger = logger; _runtimeConfigProvider = runtimeConfigProvider; _cache = cache; + _embeddingService = embeddingService; } /// @@ -140,6 +144,15 @@ await ExecuteAsync(structure, dataSourceName, isMultipleCreateOperation: true), ISqlMetadataProvider sqlMetadataProvider = _sqlMetadataProviderFactory.GetMetadataProvider(dataSourceName); if (sqlMetadataProvider.GraphQLStoredProcedureExposedNameToEntityNameMap.TryGetValue(context.Selection.Field.Name, out string? entityName)) { + // Phase 3: substitute embed:true parameters with embedding vectors (GraphQL path) + // Helper handles the case where embed params exist but service is null (throws 503). + Entity entity = _runtimeConfigProvider.GetConfig().Entities[entityName]; + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + parameters, + entity.Source.Parameters, + _embeddingService, + _httpContextAccessor.HttpContext?.RequestAborted ?? CancellationToken.None); + SqlExecuteStructure sqlExecuteStructure = new( entityName, sqlMetadataProvider, @@ -198,6 +211,17 @@ await ExecuteAsync(structure, dataSourceName, isMultipleCreateOperation: true), /// public async Task ExecuteAsync(StoredProcedureRequestContext context, string dataSourceName) { + // Phase 3: substitute embed:true parameters with embedding vectors + // before SqlExecuteStructure reads them. The helper replaces text values + // (e.g., "wireless headphones") with vector JSON strings (e.g., "[0.012,...]"). + // Helper handles the case where embed params exist but service is null (throws 503). + Entity entity = _runtimeConfigProvider.GetConfig().Entities[context.EntityName]; + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + context.ResolvedParameters, + entity.Source.Parameters, + _embeddingService, + _httpContextAccessor.HttpContext?.RequestAborted ?? CancellationToken.None); + SqlExecuteStructure structure = new( context.EntityName, _sqlMetadataProviderFactory.GetMetadataProvider(dataSourceName), diff --git a/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs b/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs new file mode 100644 index 0000000000..3ef394d81f --- /dev/null +++ b/src/Core/Services/Embeddings/ParameterEmbeddingHelper.cs @@ -0,0 +1,279 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Globalization; +using System.Net; +using System.Text.Json; +using Azure.DataApiBuilder.Config.ObjectModel; +using Azure.DataApiBuilder.Service.Exceptions; + +namespace Azure.DataApiBuilder.Core.Services.Embeddings; + +/// +/// Substitutes text values with embedding vectors for stored procedure parameters marked with embed:true. +/// +/// Called before SqlExecuteStructure construction — the substituted values flow through +/// the normal String type conversion path (thanks to the metadata type override that +/// changes VECTOR params from Byte[] to String during startup). +/// +/// Example: +/// Input: resolvedParams["query_vector"] = "wireless headphones" (user's text) +/// Output: resolvedParams["query_vector"] = "[0.012,-0.045,...,0.083]" (vector JSON string) +/// +public static class ParameterEmbeddingHelper +{ + /// + /// For each parameter marked embed:true in config, replaces the text value in + /// resolvedParams with a serialized vector string. + /// + /// Implementation uses 3 phases for efficiency when a sproc has multiple embed params: + /// 1. COLLECT — validate all embed param values, build a list of texts to embed + /// 2. BATCH — single TryEmbedBatchAsync call instead of N sequential TryEmbedAsync calls + /// 3. SUBSTITUTE — serialize each returned vector and write back into resolvedParams + /// + /// For the common single-embed-param case, batch of 1 is equivalent to sequential. + /// For multi-embed sprocs, this saves ~(N-1) × API_LATENCY on cache miss. + /// + /// The embedding call goes through EmbeddingService which has built-in FusionCache, + /// so repeated identical texts return instantly without calling the AI provider. + /// + /// + /// The parameter dictionary from the request (REST body/query string or GraphQL args). + /// Modified in-place: text values for embed params are replaced with vector JSON strings. + /// + /// + /// Parameter metadata from dab-config.json for this entity. + /// Contains the embed:true flag for each parameter. + /// + /// The embedding service to call for text → vector conversion. + /// Cancellation token from the HTTP request. + public static async Task SubstituteEmbedParametersAsync( + IDictionary resolvedParams, + List? configParams, + IEmbeddingService? embeddingService, + CancellationToken cancellationToken) + { + // Nothing to do if no config params defined + if (configParams is null) + { + return; + } + + // Quick exit: are there any embed params at all in config? + bool hasEmbedParams = configParams.Any(p => p.Embed); + if (!hasEmbedParams) + { + return; + } + + // If we have embed params in config but no embedding service, fail loudly. + // This catches DI misconfiguration and future code paths that construct engines + // without the service. Without this check, the silent-skip behavior would send + // raw text to SQL, producing confusing errors or silently wrong results. + // + // Note: Startup config validation already requires runtime.embeddings to be + // configured and enabled when embed:true is present (see ValidateEmbedParameters). + // This is defense-in-depth for unexpected DI states at runtime. + if (embeddingService is null) + { + throw new DataApiBuilderException( + message: "An embed parameter is configured but the embedding service is not available. Verify runtime.embeddings is configured and enabled.", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); + } + + // ───────────────────────────────────────────────────────────────────── + // PHASE 1: COLLECT — validate each embed param's value and gather texts + // ───────────────────────────────────────────────────────────────────── + // We validate eagerly (per-param) so error messages stay specific: + // "Parameter 'foo' has embed:true but received a non-string value..." + // rather than the batch-level "embed failed for params X, Y, Z." + // + // After this phase: embedRequests has (paramName, text) for every embed + // param the user supplied. Param metadata defines order; we preserve it. + + List<(string paramName, string text)> embedRequests = new(); + + foreach (ParameterMetadata configParam in configParams) + { + // Skip non-embed params — they pass through unchanged + // Example: "top_k" has Embed=false → skip + if (!configParam.Embed) + { + continue; + } + + // Skip embed params not supplied in the request. We don't enforce + // required-ness here because DAB's request validation for sprocs only + // checks for extra fields, not missing ones — see RequestValidator + // .ValidateStoredProcedureRequestContext, which delegates required-param + // detection to SQL Server (no easy way to read default-value metadata + // from sys.parameters in a portable way). + // + // When a required embed param is missing entirely, SqlExecuteStructure + // also won't bind it, and SQL Server returns "expects parameter X, which + // was not supplied." MsSqlDbExceptionParser translates that to a 400 + // DatabaseInputError for the client. So missing values still produce a + // clear, actionable error — just via the SQL-error-relay path rather + // than this helper. + // + // (Explicit null or empty string DOES reach this helper, via the + // IsNullOrWhiteSpace check below — that path produces our own 400 with + // the friendlier "has 'embed: true' but the provided text is empty or + // whitespace" message.) + if (!resolvedParams.TryGetValue(configParam.Name, out object? value)) + { + continue; + } + + // Validate type and extract text (throws 400 for non-strings) + string? text = ExtractTextValue(configParam.Name, value); + + // Validate non-empty (throws 400 for empty/whitespace/null) + if (string.IsNullOrWhiteSpace(text)) + { + throw new DataApiBuilderException( + message: $"Parameter '{configParam.Name}' has 'embed: true' but the provided text is empty or whitespace.", + statusCode: HttpStatusCode.BadRequest, + subStatusCode: DataApiBuilderException.SubStatusCodes.BadRequest); + } + + embedRequests.Add((configParam.Name, text!)); + } + + // No embed param values supplied in this request — nothing to embed + if (embedRequests.Count == 0) + { + return; + } + + // ───────────────────────────────────────────────────────────────────── + // PHASE 2: BATCH — single API call for all texts at once + // ───────────────────────────────────────────────────────────────────── + // EmbeddingService.TryEmbedBatchAsync (Phase 1 infra) does its own per-text + // FusionCache check. Texts that hit the cache don't trigger API calls; + // only uncached texts go to Azure OpenAI in a single batched request. + // + // For 1 text: equivalent to TryEmbedAsync (no overhead). + // For N texts: 1 API call instead of N sequential ones (saves ~(N-1) × latency). + + string[] texts = embedRequests.Select(r => r.text).ToArray(); + EmbeddingBatchResult batchResult = await embeddingService.TryEmbedBatchAsync(texts, cancellationToken); + + if (!batchResult.Success || batchResult.Embeddings is null) + { + // Batch failure: we lose per-param error specificity here, but the + // batch result's ErrorMessage typically explains the underlying issue. + // Naming all involved params helps the user identify the request context. + string paramNames = string.Join(", ", embedRequests.Select(r => $"'{r.paramName}'")); + throw new DataApiBuilderException( + message: $"Failed to generate embeddings for parameter(s) {paramNames}.", + statusCode: HttpStatusCode.InternalServerError, + subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); + } + + // Defensive: TryEmbedBatchAsync should return one embedding per input text + if (batchResult.Embeddings.Length != embedRequests.Count) + { + throw new DataApiBuilderException( + message: $"Embedding service returned {batchResult.Embeddings.Length} embeddings but {embedRequests.Count} were requested.", + statusCode: HttpStatusCode.InternalServerError, + subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); + } + + // ───────────────────────────────────────────────────────────────────── + // PHASE 3: SUBSTITUTE — serialize each vector and write back into resolvedParams + // ───────────────────────────────────────────────────────────────────── + // Order is preserved: embedRequests[i] corresponds to batchResult.Embeddings[i]. + + for (int i = 0; i < embedRequests.Count; i++) + { + float[] embedding = batchResult.Embeddings[i]; + if (embedding is null || embedding.Length == 0) + { + throw new DataApiBuilderException( + message: $"Embedding service returned an empty vector for parameter '{embedRequests[i].paramName}'.", + statusCode: HttpStatusCode.InternalServerError, + subStatusCode: DataApiBuilderException.SubStatusCodes.UnexpectedError); + } + + // Serialize float[] to JSON string that Azure SQL accepts as VECTOR + // Example: float[] { 0.012f, -0.045f, 0.083f } → "[0.012,-0.045,0.083]" + // + // Uses InvariantCulture to prevent European locales from using comma as decimal separator + // (e.g., German locale would produce "0,012" instead of "0.012" → invalid JSON) + // + // Uses "G9" format specifier (NOT "G" or "R"): + // - "G" defaults to G7 for Single, which is NOT round-trippable (~30% of values lose precision) + // - "R" is documented to fail round-trip in some cases for Single + // (per Microsoft docs: "We recommend that you use the G9 format specifier instead.") + // - "G9" guarantees the string can be parsed back to the exact same float + // - Embeddings are precision-sensitive — even tiny drift affects cosine similarity scores + string vectorJson = "[" + + string.Join(",", embedding.Select(f => f.ToString("G9", CultureInfo.InvariantCulture))) + + "]"; + + // Replace the text value with the vector string in-place + // SqlExecuteStructure will see this as a String value (thanks to metadata override) + // and pass it through to SQL, which auto-casts to VECTOR(N) + resolvedParams[embedRequests[i].paramName] = vectorJson; + } + } + + /// + /// Validates that the parameter value is a string (or JsonElement of kind String) and extracts it. + /// + /// Azure OpenAI's embedding API only accepts strings as input — passing a number, + /// boolean, array, or object would either be silently stringified into garbage + /// (e.g., "System.Object[]") or rejected by the API with a confusing error. + /// + /// DAB delivers parameter values as either System.String OR System.Text.Json.JsonElement + /// (the JSON parser wraps body values in JsonElement). We accept both string and + /// JsonElement-of-kind-String, and reject all other types with a clear 400. + /// + /// Example FAIL: { "query_vector": 12345 } → JsonElement(Number) → 400 "must be a string" + /// Example FAIL: { "query_vector": true } → JsonElement(True) → 400 "must be a string" + /// Example FAIL: { "query_vector": ["a","b"] } → JsonElement(Array) → 400 "must be a string" + /// Example FAIL: { "query_vector": {"foo":"bar"} } → JsonElement(Object) → 400 "must be a string" + /// Example PASS: { "query_vector": "headphones" } → JsonElement(String) or string → returns text + /// + /// Note: GraphQL is automatically protected since the embed param is exposed as + /// GraphQL String type (via Stage 2 metadata override) — non-strings are rejected + /// by the GraphQL parser before reaching this code. + /// + private static string? ExtractTextValue(string paramName, object? value) + { + if (value is string s) + { + return s; + } + + if (value is JsonElement jsonElement) + { + if (jsonElement.ValueKind == JsonValueKind.String) + { + return jsonElement.GetString(); + } + + if (jsonElement.ValueKind == JsonValueKind.Null) + { + return null; // Will be caught by IsNullOrWhiteSpace in caller + } + + throw new DataApiBuilderException( + message: $"Parameter '{paramName}' has 'embed: true' but received a non-string JSON value of kind '{jsonElement.ValueKind}'. Embed parameters must be JSON strings.", + statusCode: HttpStatusCode.BadRequest, + subStatusCode: DataApiBuilderException.SubStatusCodes.BadRequest); + } + + if (value is not null) + { + throw new DataApiBuilderException( + message: $"Parameter '{paramName}' has 'embed: true' but received a non-string value of type '{value.GetType().Name}'. Embed parameters must be JSON strings.", + statusCode: HttpStatusCode.BadRequest, + subStatusCode: DataApiBuilderException.SubStatusCodes.BadRequest); + } + + return null; // null value will be caught by IsNullOrWhiteSpace in caller + } +} diff --git a/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs b/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs index cdb54a2ac2..583c59430a 100644 --- a/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs +++ b/src/Core/Services/MetadataProviders/MsSqlMetadataProvider.cs @@ -223,6 +223,17 @@ protected override async Task FillSchemaForStoredProcedureAsync( parameterDefinition.Default = paramMetadata.Default; parameterDefinition.HasConfigDefault = paramMetadata.Default is not null; parameterDefinition.ConfigDefaultValue = paramMetadata.Default?.ToString(); + + // Phase 3: validate and override types for embed:true parameters. + // Logic extracted into a testable helper. See ApplyEmbedTypeOverride + // for the full rationale (Byte[] is our VECTOR(N) signal, override + // applies the String type pipeline, etc.). + ApplyEmbedTypeOverride( + parameterDefinition, + paramMetadata, + schemaName: schemaName, + storedProcedureName: storedProcedureSourceName, + parameterName: configParamKey); } } } @@ -231,6 +242,92 @@ protected override async Task FillSchemaForStoredProcedureAsync( GraphQLStoredProcedureExposedNameToEntityNameMap.TryAdd(GenerateStoredProcedureGraphQLFieldName(entityName, procedureEntity), entityName); } + /// + /// Phase 3: For a stored procedure parameter declared with embed: true in config, + /// validates that the parameter is VECTOR-shaped (reported by SQL Server as Byte[]) + /// and overrides its metadata type so the embedding-substituted JSON string flows through + /// DAB's String type pipeline at request time. SQL Server auto-casts NVARCHAR → VECTOR + /// at execution time. + /// + /// + /// The parameter definition to mutate. On entry, its SystemType reflects the SQL + /// Server-reported type (typeof(byte[]) for VECTOR(N) params). On exit, if + /// .Embed is true and the type was Byte[], the SystemType, + /// DbType, and SqlDbType are overridden to String / String / NVarChar respectively. + /// + /// The config-side metadata; embed flag determines behavior. + /// Schema name (e.g. "dbo"), for error messages. + /// Stored-procedure name, for error messages. + /// Parameter name (without leading @), for error messages. + /// + /// Thrown at startup with HTTP 503 if embed: true is set but the parameter is not + /// VECTOR-shaped (i.e., not Byte[]). Catches NVARCHAR/INT/DATETIME misconfigurations + /// before they cause silent wrong results or confusing errors at request time. + /// + /// + /// SQL Server reports VECTOR(N) as "varbinary" via INFORMATION_SCHEMA.PARAMETERS, + /// which DAB maps to typeof(byte[]). We use this Byte[] mapping as our signal + /// that the parameter is VECTOR-shaped. If the param is NOT Byte[], the + /// developer has misconfigured — embed only makes sense for VECTOR parameters. + /// + /// Why fail at startup instead of letting it silently break at request time: + /// - embed:true on NVARCHAR → vector JSON treated as text → empty/wrong results, no error + /// - embed:true on INT/DATETIME → SQL conversion error at request time + /// - embed:true on VECTOR(N) → works correctly (SQL reports as Byte[]) + /// Catching this at startup gives a clear error message before any user is impacted. + /// + /// KNOWN LIMITATION: SQL Server's INFORMATION_SCHEMA.PARAMETERS reports both + /// VECTOR(N) and varbinary(N) as the same "varbinary" type. We cannot + /// distinguish them here. If a developer mistakenly applies embed:true to + /// a real varbinary parameter (e.g., image blob), this override will fire + /// and the request will fail at SQL execution with an "implicit conversion + /// not allowed" error. That failure is loud and clear, so we accept it. + /// Documented in ParameterMetadata.Embed and the JSON schema. + /// + /// (Detection IS possible via sys.parameters which distinguishes vector + /// from varbinary by user_type_id, but adding a separate query for this + /// would require infrastructure changes beyond Phase 3 scope. Filed as + /// future enhancement.) + /// + /// Follows the same pattern as the DateTime DbType override above. + /// + /// Internal static (rather than private inline) to enable direct unit testing + /// via the InternalsVisibleTo attribute on Azure.DataApiBuilder.Core. + /// + internal static void ApplyEmbedTypeOverride( + ParameterDefinition parameterDefinition, + ParameterMetadata paramMetadata, + string schemaName, + string storedProcedureName, + string parameterName) + { + if (!paramMetadata.Embed) + { + return; + } + + // Reject embed:true on non-VECTOR-shaped params at startup with a + // clear error. This catches NVARCHAR/INT/DATETIME misconfigurations + // that would otherwise silently produce wrong results at request time. + if (parameterDefinition.SystemType != typeof(byte[])) + { + throw new DataApiBuilderException( + message: $"Procedure '{schemaName}.{storedProcedureName}': " + + $"parameter '{parameterName}' has 'embed: true' in config but is " + + $"declared as '{parameterDefinition.SystemType.Name}' in the stored procedure. " + + $"Embed parameters must be declared as VECTOR(N).", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: DataApiBuilderException.SubStatusCodes.ErrorInInitialization); + } + + // Override the metadata type so the vector JSON string flows through + // DAB's String type pipeline. SQL Server auto-casts NVARCHAR → VECTOR + // at execution time. + parameterDefinition.SystemType = typeof(string); + parameterDefinition.DbType = System.Data.DbType.String; + parameterDefinition.SqlDbType = System.Data.SqlDbType.NVarChar; + } + /// protected override void PopulateMetadataForLinkingObject( string entityName, diff --git a/src/Service.Tests/UnitTests/ConfigValidationUnitTests.cs b/src/Service.Tests/UnitTests/ConfigValidationUnitTests.cs index bbb4874d1a..77da4b7156 100644 --- a/src/Service.Tests/UnitTests/ConfigValidationUnitTests.cs +++ b/src/Service.Tests/UnitTests/ConfigValidationUnitTests.cs @@ -3579,6 +3579,333 @@ public void ValidateEmbeddingsOptions_EndpointDisabled_SkipsValidation() configValidator.ValidateEmbeddingsOptions(runtimeConfig); } + #region Embed Parameters Validation + + // ───────────────────────────────────────────────────────────────── + // Tests for RuntimeConfigValidator.ValidateEmbedParameters (Phase 3) + // Covers the four startup rules: + // Rule 0 — embed:true requires MSSQL data source + // Rule 1 — embed:true requires stored-procedure entity + // Rule 2 — embed:true requires runtime.embeddings configured & enabled + // Rule 3 — embed:true cannot be combined with a `default` value + // Plus multi-entity / message-content checks. + // ───────────────────────────────────────────────────────────────── + + /// + /// Builds a valid instance with embeddings enabled. + /// Used to satisfy Rule 2 in tests that target other rules. + /// + private static EmbeddingsOptions BuildEmbeddingsEnabled() => + new( + Provider: EmbeddingProviderType.AzureOpenAI, + BaseUrl: "https://test.openai.azure.com", + ApiKey: "test-key", + Enabled: true, + Model: "text-embedding-ada-002"); + + /// + /// Builds a stored-procedure with one parameter whose + /// Embed flag and Default value are caller-controlled. + /// + private static Entity BuildSprocEntity( + string sprocObject, + string paramName, + bool embed, + string defaultValue = null) + { + return new( + Source: new( + Type: EntitySourceType.StoredProcedure, + Object: sprocObject, + Parameters: new List + { + new() { Name = paramName, Embed = embed, Default = defaultValue } + }, + KeyFields: null), + Fields: null, + Rest: new(EntityRestOptions.DEFAULT_HTTP_VERBS_ENABLED_FOR_SP), + GraphQL: new($"{paramName}_singular", $"{paramName}_plural"), + Permissions: new EntityPermission[] + { + new( + Role: "anonymous", + Actions: new[] { new EntityAction(EntityActionOperation.Execute, null, new()) }) + }, + Relationships: null, + Mappings: null); + } + + /// + /// Builds an entity of the given source type (Table / View / StoredProcedure) + /// with one embed:true parameter. Used to test that embed:true is rejected + /// on non-stored-procedure entities (Rule 1). + /// + private static Entity BuildEntityWithSourceType( + EntitySourceType sourceType, + string objectName, + string paramName) + { + return new( + Source: new( + Type: sourceType, + Object: objectName, + Parameters: new List + { + new() { Name = paramName, Embed = true } + }, + KeyFields: null), + Fields: null, + Rest: new(), + GraphQL: new("entity", "entities"), + Permissions: new EntityPermission[] + { + new( + Role: "anonymous", + Actions: new[] { new EntityAction(EntityActionOperation.Read, null, new()) }) + }, + Relationships: null, + Mappings: null); + } + + /// + /// Builds a wrapping the given entities, with + /// caller-controlled data source type and embeddings configuration. + /// + /// Passing = null literally produces a config with + /// runtime.embeddings = null (the "section missing" scenario for Rule 2). Tests + /// that want the happy-path embeddings should pass BuildEmbeddingsEnabled(). + /// + private static RuntimeConfig BuildRuntimeConfigForEmbedTest( + Dictionary entities, + DatabaseType databaseType = DatabaseType.MSSQL, + EmbeddingsOptions embeddings = null) + { + return new( + Schema: "UnitTestSchema", + DataSource: new DataSource(databaseType, "", Options: null), + Runtime: new( + Rest: new(), + GraphQL: new(), + Mcp: new(), + Host: new(null, null), + Embeddings: embeddings), + Entities: new(entities)); + } + + /// + /// Rule 1: embed:true on a stored-procedure entity (with all other rules satisfied) + /// is the happy path. The validator should NOT throw and should record no + /// validation exceptions. + /// + [TestMethod] + public void ValidateEmbedParameters_EmbedOnStoredProcedure_NoError() + { + Dictionary entities = new() + { + { "SearchProducts", BuildSprocEntity("sp_search", "q", embed: true) } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, embeddings: BuildEmbeddingsEnabled()); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + // Should not throw + validator.ValidateEmbedParameters(config); + + // And no validation exceptions should have been recorded. + // (Defensive: in default mode HandleOrRecordException throws rather than + // appending, so the absence of a throw above already guarantees Count == 0. + // This explicit check documents the success criterion and would catch a + // future change that switched to record-only mode without our notice.) + Assert.AreEqual(0, validator.ConfigValidationExceptions.Count, + "Expected no validation exceptions for the happy path"); + + // Sanity: the entity under test is wired correctly so a future refactor + // can't make this test pass by accidentally short-circuiting. + Assert.AreEqual(EntitySourceType.StoredProcedure, config.Entities["SearchProducts"].Source.Type); + Assert.IsTrue(config.Entities["SearchProducts"].Source.Parameters!.Any(p => p.Embed), + "Test setup expected at least one embed:true parameter"); + } + + /// + /// Rule 1: embed:true on a Table or View entity must throw a config-validation + /// exception naming the offending entity, parameter, and source-type constraint. + /// + [DataTestMethod] + [DataRow(EntitySourceType.Table, DisplayName = "Rule 1: embed:true on Table → 503")] + [DataRow(EntitySourceType.View, DisplayName = "Rule 1: embed:true on View → 503")] + public void ValidateEmbedParameters_EmbedOnNonSproc_ThrowsConfigError(EntitySourceType sourceType) + { + Dictionary entities = new() + { + { "Product", BuildEntityWithSourceType(sourceType, "products", "q") } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, embeddings: BuildEmbeddingsEnabled()); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + DataApiBuilderException ex = Assert.ThrowsException( + () => validator.ValidateEmbedParameters(config)); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + Assert.AreEqual(DataApiBuilderException.SubStatusCodes.ConfigValidationError, ex.SubStatusCode); + StringAssert.Contains(ex.Message, "Product"); + StringAssert.Contains(ex.Message, "q"); + StringAssert.Contains(ex.Message, "stored-procedure"); + } + + /// + /// Rule 2: embed:true with embeddings disabled (or absent) must throw a + /// config-validation exception. Two flavors: section present but enabled=false, + /// and section completely absent (null). + /// + [DataTestMethod] + [DataRow(false, DisplayName = "Rule 2: embeddings.enabled=false → 503")] + [DataRow(true, DisplayName = "Rule 2: embeddings section missing → 503")] + public void ValidateEmbedParameters_EmbedTrue_EmbeddingsNotConfigured_ThrowsConfigError(bool nullEmbeddings) + { + EmbeddingsOptions embeddings = nullEmbeddings + ? null + : new EmbeddingsOptions( + Provider: EmbeddingProviderType.OpenAI, + BaseUrl: "https://api.openai.com", + ApiKey: "key", + Enabled: false); + + Dictionary entities = new() + { + { "SearchProducts", BuildSprocEntity("sp_search", "q", embed: true) } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, embeddings: embeddings); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + DataApiBuilderException ex = Assert.ThrowsException( + () => validator.ValidateEmbedParameters(config)); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + Assert.AreEqual(DataApiBuilderException.SubStatusCodes.ConfigValidationError, ex.SubStatusCode); + StringAssert.Contains(ex.Message, "SearchProducts"); + StringAssert.Contains(ex.Message, "q"); + StringAssert.Contains(ex.Message, "runtime.embeddings"); + } + + /// + /// Rule 3: embed:true combined with a default value is not supported. + /// Should throw a config-validation exception even though all other rules pass. + /// + [TestMethod] + public void ValidateEmbedParameters_EmbedTrue_WithDefault_ThrowsConfigError() + { + Dictionary entities = new() + { + { + "SearchProducts", + BuildSprocEntity("sp_search", "q", embed: true, defaultValue: "wireless headphones") + } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, embeddings: BuildEmbeddingsEnabled()); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + DataApiBuilderException ex = Assert.ThrowsException( + () => validator.ValidateEmbedParameters(config)); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + Assert.AreEqual(DataApiBuilderException.SubStatusCodes.ConfigValidationError, ex.SubStatusCode); + StringAssert.Contains(ex.Message, "SearchProducts"); + StringAssert.Contains(ex.Message, "q"); + StringAssert.Contains(ex.Message, "default"); + } + + /// + /// Rule 3 happy path: embed:true with no default value is allowed. The validator + /// should NOT throw and should record no validation exceptions. + /// + [TestMethod] + public void ValidateEmbedParameters_EmbedTrue_WithoutDefault_NoError() + { + Dictionary entities = new() + { + { "SearchProducts", BuildSprocEntity("sp_search", "q", embed: true, defaultValue: null) } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, embeddings: BuildEmbeddingsEnabled()); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + // Should not throw + validator.ValidateEmbedParameters(config); + + // And no validation exceptions should have been recorded. + Assert.AreEqual(0, validator.ConfigValidationExceptions.Count, + "Expected no validation exceptions when embed:true is paired with no default"); + + // Sanity: the embed param under test was set up with no default, as intended. + ParameterMetadata embedParam = config.Entities["SearchProducts"].Source.Parameters! + .Single(p => p.Embed); + Assert.IsNull(embedParam.Default, "Test setup expected the embed param to have no default"); + } + + /// + /// Rule 0: embed:true is only supported on MSSQL data sources. The validator + /// should reject embed:true on PostgreSQL, MySQL, DWSQL, or CosmosDB. + /// + [DataTestMethod] + [DataRow(DatabaseType.PostgreSQL, DisplayName = "Rule 0: embed:true on PostgreSQL → 503")] + [DataRow(DatabaseType.MySQL, DisplayName = "Rule 0: embed:true on MySQL → 503")] + public void ValidateEmbedParameters_EmbedTrue_NonMssqlDataSource_ThrowsConfigError(DatabaseType dbType) + { + Dictionary entities = new() + { + { "SearchProducts", BuildSprocEntity("sp_search", "q", embed: true) } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, databaseType: dbType, embeddings: BuildEmbeddingsEnabled()); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + DataApiBuilderException ex = Assert.ThrowsException( + () => validator.ValidateEmbedParameters(config)); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + Assert.AreEqual(DataApiBuilderException.SubStatusCodes.ConfigValidationError, ex.SubStatusCode); + StringAssert.Contains(ex.Message, dbType.ToString()); + StringAssert.Contains(ex.Message, "Azure SQL"); + } + + /// + /// When multiple entities exist and ONE has an embed-rule violation, the + /// thrown exception must clearly identify the offending entity and parameter + /// (so the user can fix their config without grep-and-guess). + /// + [TestMethod] + public void ValidateEmbedParameters_MultipleEntities_OneViolates_NamesViolatingEntityAndParam() + { + Dictionary entities = new() + { + // Healthy entities + { "OkProducts", BuildSprocEntity("sp_ok", "q", embed: true) }, + { "AlsoOk", BuildSprocEntity("sp_other", "input", embed: true) }, + // The violator: embed:true on a table + { "BadEntity", BuildEntityWithSourceType(EntitySourceType.Table, "bad_table", "queryParam") } + }; + RuntimeConfig config = BuildRuntimeConfigForEmbedTest(entities, embeddings: BuildEmbeddingsEnabled()); + + RuntimeConfigValidator validator = InitializeRuntimeConfigValidator(); + + DataApiBuilderException ex = Assert.ThrowsException( + () => validator.ValidateEmbedParameters(config)); + + // Must name the offending entity and param specifically + StringAssert.Contains(ex.Message, "BadEntity"); + StringAssert.Contains(ex.Message, "queryParam"); + // Must not name the healthy ones + Assert.IsFalse(ex.Message.Contains("OkProducts"), + "Error should not mention healthy entities; was: " + ex.Message); + Assert.IsFalse(ex.Message.Contains("AlsoOk"), + "Error should not mention healthy entities; was: " + ex.Message); + } + + #endregion + private static RuntimeConfigValidator InitializeRuntimeConfigValidator() { MockFileSystem fileSystem = new(); diff --git a/src/Service.Tests/UnitTests/ParameterEmbeddingHelperTests.cs b/src/Service.Tests/UnitTests/ParameterEmbeddingHelperTests.cs new file mode 100644 index 0000000000..8f58f9e2d5 --- /dev/null +++ b/src/Service.Tests/UnitTests/ParameterEmbeddingHelperTests.cs @@ -0,0 +1,749 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#nullable enable + +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Net; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using Azure.DataApiBuilder.Config.ObjectModel; +using Azure.DataApiBuilder.Core.Services.Embeddings; +using Azure.DataApiBuilder.Service.Exceptions; +using Microsoft.VisualStudio.TestTools.UnitTesting; +using Moq; + +namespace Azure.DataApiBuilder.Service.Tests.UnitTests; + +/// +/// Unit tests for . +/// +/// Covers all behaviors of the helper that converts text-valued sproc parameters +/// (declared with embed: true in config) into vector JSON strings via a +/// mocked . +/// +/// Test groups (organized into #region blocks below): +/// 1. No-Op Cases — early-exit paths that never touch the service +/// 2. Service Availability — null IEmbeddingService handling +/// 3. Input Type Validation — accepted vs rejected value types +/// 4. Empty And Whitespace — empty / whitespace / null text rejection +/// 5. Batching Behavior — proves single batch call (not N sequential) +/// 6. Batch Result Handling — success, failure, length mismatch, null vector +/// 7. Output Format And Cancellation — G9 + InvariantCulture; cancellation token forwarding +/// +[TestClass] +public class ParameterEmbeddingHelperTests +{ + private Mock _mockService = null!; + + [TestInitialize] + public void Setup() + { + _mockService = new Mock(MockBehavior.Strict); + } + + // ───────────────────────────────────────────────────────────────────── + // Helper factories — keep test bodies focused on what's being tested + // ───────────────────────────────────────────────────────────────────── + + private static ParameterMetadata EmbedParam(string name) => + new() { Name = name, Embed = true }; + + private static ParameterMetadata NormalParam(string name) => + new() { Name = name, Embed = false }; + + /// + /// Parses a JSON literal into a standalone that survives + /// after the source is disposed (via Clone). + /// + private static JsonElement JsonElementFrom(string json) + { + using JsonDocument doc = JsonDocument.Parse(json); + return doc.RootElement.Clone(); + } + + /// + /// Configures the mock to return a successful batch result with the given embeddings. + /// + private void SetupBatch(params float[][] embeddings) + { + _mockService + .Setup(s => s.TryEmbedBatchAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(new EmbeddingBatchResult(true, embeddings, null)); + } + + /// + /// Verifies the batch method was called exactly once AND that single-text TryEmbedAsync + /// was never called (proves we use the batched path, not sequential per-param calls). + /// + private void VerifyBatchedExactlyOnce() + { + _mockService.Verify( + s => s.TryEmbedBatchAsync(It.IsAny(), It.IsAny()), + Times.Once); + _mockService.Verify( + s => s.TryEmbedAsync(It.IsAny(), It.IsAny()), + Times.Never); + } + + #region No-Op Cases + + /// + /// When configParams is null (entity has no <parameters> section in config), + /// the helper should return immediately without calling the embedding service. + /// + [TestMethod] + public async Task NullConfigParams_ReturnsImmediately_NoServiceCall() + { + Dictionary resolvedParams = new() { { "x", "anything" } }; + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams: null, _mockService.Object, CancellationToken.None); + + _mockService.VerifyNoOtherCalls(); + Assert.AreEqual("anything", resolvedParams["x"]); + } + + /// + /// When configParams contains parameters but none have embed:true, the helper + /// should return without calling the embedding service. + /// + [TestMethod] + public async Task NoEmbedParams_ReturnsImmediately_NoServiceCall() + { + Dictionary resolvedParams = new() { { "x", "value" }, { "y", 42 } }; + List configParams = new() { NormalParam("x"), NormalParam("y") }; + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.VerifyNoOtherCalls(); + Assert.AreEqual("value", resolvedParams["x"]); + Assert.AreEqual(42, resolvedParams["y"]); + } + + /// + /// When embed params are configured but none are supplied in the request, the + /// helper should reach the early-return after Phase 1 collect (embedRequests.Count == 0) + /// without calling the service. + /// + [TestMethod] + public async Task EmbedParamsConfiguredButNoneSupplied_ReturnsAfterCollect_NoServiceCall() + { + Dictionary resolvedParams = new() { { "other", 5 } }; + List configParams = new() { EmbedParam("missing"), NormalParam("other") }; + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.VerifyNoOtherCalls(); + Assert.AreEqual(5, resolvedParams["other"]); + } + + /// + /// When configParams is an empty list, the helper should return immediately + /// (the "any embed param?" check exits before any iteration). + /// + [TestMethod] + public async Task EmptyConfigParamsList_ReturnsImmediately_NoServiceCall() + { + Dictionary resolvedParams = new() { { "x", "value" } }; + List configParams = new(); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.VerifyNoOtherCalls(); + } + + #endregion + + #region Service Availability + + /// + /// When the embedding service is null AND embed params are configured, the helper + /// should throw a 503 immediately (defense-in-depth check beyond config validation). + /// + [TestMethod] + public async Task NullService_WithEmbedParams_Throws503() + { + Dictionary resolvedParams = new() { { "q", "hello" } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, embeddingService: null, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + StringAssert.Contains(ex.Message, "embed parameter"); + StringAssert.Contains(ex.Message, "embedding service"); + } + + /// + /// When the embedding service is null but no embed params are configured, the helper + /// should NOT throw — the no-op early exit fires before the null-service check. + /// (Backward compat: existing entities without embed params work without an embedding service.) + /// + [TestMethod] + public async Task NullService_WithoutEmbedParams_NoThrow() + { + Dictionary resolvedParams = new() { { "x", "value" } }; + List configParams = new() { NormalParam("x") }; + + // Should not throw + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, embeddingService: null, CancellationToken.None); + + Assert.AreEqual("value", resolvedParams["x"]); + } + + #endregion + + #region Input Type Validation + + /// + /// A plain System.String value is accepted directly and embedded. + /// + [TestMethod] + public async Task PlainString_AcceptsAndEmbeds() + { + Dictionary resolvedParams = new() { { "q", "wireless headphones" } }; + List configParams = new() { EmbedParam("q") }; + SetupBatch(new[] { 0.5f, 0.25f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.Is(arr => arr.Length == 1 && arr[0] == "wireless headphones"), + It.IsAny()), + Times.Once); + Assert.AreEqual("[0.5,0.25]", resolvedParams["q"]); + } + + /// + /// A JsonElement of kind String (the JSON parser's representation of body strings) + /// is accepted and embedded. + /// + [TestMethod] + public async Task JsonElementString_AcceptsAndEmbeds() + { + Dictionary resolvedParams = new() { { "q", JsonElementFrom("\"hello\"") } }; + List configParams = new() { EmbedParam("q") }; + SetupBatch(new[] { 0.5f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.Is(arr => arr.Length == 1 && arr[0] == "hello"), + It.IsAny()), + Times.Once); + Assert.AreEqual("[0.5]", resolvedParams["q"]); + } + + /// + /// A JsonElement of kind Number (e.g., the user sent a number where text was expected) + /// must be rejected with a 400 BadRequest. + /// + [TestMethod] + public async Task JsonElementNumber_Throws400() + { + Dictionary resolvedParams = new() { { "q", JsonElementFrom("12345") } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "Number"); + StringAssert.Contains(ex.Message, "embed: true"); + } + + /// + /// A JsonElement of kind True/False (boolean) must be rejected with a 400 BadRequest. + /// + [TestMethod] + public async Task JsonElementBoolean_Throws400() + { + Dictionary resolvedParams = new() { { "q", JsonElementFrom("true") } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "True"); + } + + /// + /// A JsonElement of kind Array must be rejected with a 400 BadRequest. + /// + [TestMethod] + public async Task JsonElementArray_Throws400() + { + Dictionary resolvedParams = new() { { "q", JsonElementFrom("[\"a\",\"b\"]") } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "Array"); + } + + /// + /// A JsonElement of kind Object must be rejected with a 400 BadRequest. + /// + [TestMethod] + public async Task JsonElementObject_Throws400() + { + Dictionary resolvedParams = new() { { "q", JsonElementFrom("{\"foo\":\"bar\"}") } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "Object"); + } + + /// + /// A JsonElement of kind Null falls through to the IsNullOrWhiteSpace check and + /// is rejected as empty/whitespace (400). The helper does not throw a different + /// "non-string kind" message for Null since null is semantically "no value supplied". + /// + [TestMethod] + public async Task JsonElementNull_Throws400AsEmpty() + { + Dictionary resolvedParams = new() { { "q", JsonElementFrom("null") } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "empty or whitespace"); + } + + /// + /// A non-string, non-JsonElement value (e.g., a boxed int) must be rejected + /// with a 400 BadRequest naming the offending CLR type. + /// + [TestMethod] + public async Task NonStringNonJsonElementValue_Throws400() + { + Dictionary resolvedParams = new() { { "q", (object)42 } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "Int32"); + } + + #endregion + + #region Empty And Whitespace Validation + + /// + /// An explicit empty string value for an embed parameter is rejected with a 400. + /// + [TestMethod] + public async Task EmptyString_Throws400() + { + Dictionary resolvedParams = new() { { "q", "" } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "empty or whitespace"); + } + + /// + /// A whitespace-only string value for an embed parameter is rejected with a 400. + /// (IsNullOrWhiteSpace covers strings of only spaces, tabs, etc.) + /// + [TestMethod] + public async Task WhitespaceString_Throws400() + { + Dictionary resolvedParams = new() { { "q", " " } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "empty or whitespace"); + } + + /// + /// A C# null value (not JsonElement Null) for an embed parameter is rejected with a 400. + /// ExtractTextValue returns null for a null input; the IsNullOrWhiteSpace check then fires. + /// + [TestMethod] + public async Task NullValue_Throws400() + { + Dictionary resolvedParams = new() { { "q", null } }; + List configParams = new() { EmbedParam("q") }; + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.BadRequest, ex.StatusCode); + StringAssert.Contains(ex.Message, "empty or whitespace"); + } + + #endregion + + #region Batching Behavior + + /// + /// A single embed param results in a single batched call (with one text), NOT a + /// per-param TryEmbedAsync. Even single-param case goes through TryEmbedBatchAsync. + /// + [TestMethod] + public async Task SingleEmbedParam_CallsBatchOnce_NotSequential() + { + Dictionary resolvedParams = new() { { "q", "hello" } }; + List configParams = new() { EmbedParam("q") }; + SetupBatch(new[] { 0.5f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + VerifyBatchedExactlyOnce(); + // Verify the user's text was replaced with the serialized vector JSON + Assert.AreEqual("[0.5]", resolvedParams["q"]); + } + + /// + /// Multiple embed params (all supplied) result in a SINGLE batched call carrying + /// all texts at once — not N sequential per-param calls. This is the core batching + /// guarantee that addresses Jim's review comment #2. + /// + [TestMethod] + public async Task MultipleEmbedParams_CallsBatchOnce_NotSequential() + { + Dictionary resolvedParams = new() + { + { "p1", "alpha" }, + { "p2", "beta" }, + { "p3", "gamma" } + }; + List configParams = new() { EmbedParam("p1"), EmbedParam("p2"), EmbedParam("p3") }; + SetupBatch( + new[] { 0.5f }, new[] { 0.25f }, new[] { 0.125f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + // Exactly one batched call carrying all 3 texts (not 3 sequential calls) + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.Is(arr => arr.Length == 3), + It.IsAny()), + Times.Once); + _mockService.Verify( + s => s.TryEmbedAsync(It.IsAny(), It.IsAny()), + Times.Never); + + // Verify each user text was replaced with its corresponding serialized vector + Assert.AreEqual("[0.5]", resolvedParams["p1"]); + Assert.AreEqual("[0.25]", resolvedParams["p2"]); + Assert.AreEqual("[0.125]", resolvedParams["p3"]); + } + + /// + /// When configParams contains a mix of embed:true and embed:false params, only the + /// embed:true ones are batched. Non-embed params pass through untouched. + /// + [TestMethod] + public async Task MixedEmbedAndNonEmbed_OnlyEmbedTextsBatched() + { + Dictionary resolvedParams = new() + { + { "embedA", "search text 1" }, + { "normal1", 5 }, + { "embedB", "search text 2" }, + { "normal2", "literal" } + }; + List configParams = new() + { + EmbedParam("embedA"), + NormalParam("normal1"), + EmbedParam("embedB"), + NormalParam("normal2") + }; + SetupBatch(new[] { 0.5f }, new[] { 0.25f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.Is(arr => arr.Length == 2 && arr[0] == "search text 1" && arr[1] == "search text 2"), + It.IsAny()), + Times.Once); + + // Non-embed params pass through untouched + Assert.AreEqual(5, resolvedParams["normal1"]); + Assert.AreEqual("literal", resolvedParams["normal2"]); + // Embed params replaced with vector JSON + Assert.AreEqual("[0.5]", resolvedParams["embedA"]); + Assert.AreEqual("[0.25]", resolvedParams["embedB"]); + } + + /// + /// When configParams has multiple embed entries [A, B, C], the texts in the batch + /// call must be in that same order — the Substitute phase relies on index alignment + /// between the request list and the response list. + /// + [TestMethod] + public async Task MultipleEmbedParams_OrderPreserved_BatchTextsMatchConfigOrder() + { + Dictionary resolvedParams = new() + { + { "C", "third" }, + { "A", "first" }, + { "B", "second" } + }; + // configParams declared in order A, B, C — this is the order that should be preserved. + List configParams = new() { EmbedParam("A"), EmbedParam("B"), EmbedParam("C") }; + SetupBatch(new[] { 0.5f }, new[] { 0.25f }, new[] { 0.125f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.Is(arr => + arr.Length == 3 && arr[0] == "first" && arr[1] == "second" && arr[2] == "third"), + It.IsAny()), + Times.Once); + + // And substitution preserves order: A → 0.5, B → 0.25, C → 0.125 + Assert.AreEqual("[0.5]", resolvedParams["A"]); + Assert.AreEqual("[0.25]", resolvedParams["B"]); + Assert.AreEqual("[0.125]", resolvedParams["C"]); + } + + /// + /// When 3 embed params are configured but only 2 are supplied in the request, + /// the batch should contain only the supplied subset. The third param is left + /// for downstream handling (SQL-level required-param error). + /// + [TestMethod] + public async Task PartiallySuppliedEmbedParams_BatchesSubsetOnly() + { + Dictionary resolvedParams = new() + { + { "p1", "supplied 1" }, + { "p3", "supplied 3" } + // p2 is missing + }; + List configParams = new() { EmbedParam("p1"), EmbedParam("p2"), EmbedParam("p3") }; + SetupBatch(new[] { 0.5f }, new[] { 0.125f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.Is(arr => arr.Length == 2 && arr[0] == "supplied 1" && arr[1] == "supplied 3"), + It.IsAny()), + Times.Once); + + Assert.AreEqual("[0.5]", resolvedParams["p1"]); + Assert.AreEqual("[0.125]", resolvedParams["p3"]); + Assert.IsFalse(resolvedParams.ContainsKey("p2")); + } + + #endregion + + #region Batch Result Handling + + /// + /// A successful batch result should produce vector JSON substitutions in resolvedParams + /// — happy-path coverage for the substitute phase. + /// + [TestMethod] + public async Task BatchSuccess_VectorsSubstitutedInResolvedParams() + { + Dictionary resolvedParams = new() { { "p1", "x" }, { "p2", "y" } }; + List configParams = new() { EmbedParam("p1"), EmbedParam("p2") }; + SetupBatch(new[] { 1.0f, 2.0f }, new[] { 3.0f, 4.0f }); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + + Assert.AreEqual("[1,2]", resolvedParams["p1"]); + Assert.AreEqual("[3,4]", resolvedParams["p2"]); + } + + /// + /// A failed batch result (Success: false) should throw a 500 InternalServerError + /// listing all involved param names so the caller can identify request context. + /// + [TestMethod] + public async Task BatchFailure_Throws500_WithAllParamNames() + { + Dictionary resolvedParams = new() { { "p1", "a" }, { "p2", "b" } }; + List configParams = new() { EmbedParam("p1"), EmbedParam("p2") }; + _mockService + .Setup(s => s.TryEmbedBatchAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(new EmbeddingBatchResult(false, null, "OpenAI rate limit")); + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.InternalServerError, ex.StatusCode); + StringAssert.Contains(ex.Message, "p1"); + StringAssert.Contains(ex.Message, "p2"); + } + + /// + /// A successful batch result with the wrong embedding count (service contract violation) + /// should throw a 500 InternalServerError mentioning the count mismatch, instead of + /// silently using the wrong vectors. + /// + [TestMethod] + public async Task BatchLengthMismatch_Throws500() + { + Dictionary resolvedParams = new() { { "p1", "a" }, { "p2", "b" } }; + List configParams = new() { EmbedParam("p1"), EmbedParam("p2") }; + // Service returns ONE embedding when TWO were requested + SetupBatch(new[] { 0.1f }); + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.InternalServerError, ex.StatusCode); + StringAssert.Contains(ex.Message, "1"); + StringAssert.Contains(ex.Message, "2"); + } + + /// + /// A batch with one valid and one empty embedding (e.g., empty array slot) should + /// throw a 500 naming the specific param whose embedding was empty. + /// + [TestMethod] + public async Task IndividualEmbeddingEmpty_Throws500_NamingFailedParam() + { + Dictionary resolvedParams = new() { { "p1", "a" }, { "p2", "b" } }; + List configParams = new() { EmbedParam("p1"), EmbedParam("p2") }; + // Second embedding is empty — defensive check should fire + SetupBatch(new[] { 0.1f }, System.Array.Empty()); + + DataApiBuilderException ex = await Assert.ThrowsExceptionAsync( + () => ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None)); + + Assert.AreEqual(HttpStatusCode.InternalServerError, ex.StatusCode); + StringAssert.Contains(ex.Message, "p2"); + } + + #endregion + + #region Output Format And Cancellation + + /// + /// The vector JSON output must use G9 float formatting and InvariantCulture to ensure + /// round-trippability and locale-independent decimal separators. This is critical for + /// embedding precision (G7 default would lose ~30% of values per Microsoft docs). + /// + /// Tests: + /// - No spaces between values + /// - Period (.) as decimal separator regardless of thread culture + /// - G9 precision preserved (a value that requires G9 to round-trip) + /// + [TestMethod] + public async Task VectorJson_UsesG9AndInvariantCulture() + { + Dictionary resolvedParams = new() { { "q", "any" } }; + List configParams = new() { EmbedParam("q") }; + + // 0.1f cannot be exactly represented in binary float; G9 preserves enough digits + // to round-trip. G7 (the "G" default) would give "0.1" which loses precision. + SetupBatch(new[] { 0.1f, -0.2f, 0.0001234567f }); + + // Save and switch culture to ensure InvariantCulture is enforced + CultureInfo originalCulture = CultureInfo.CurrentCulture; + try + { + // German locale uses comma as decimal separator — proves we're using InvariantCulture + CultureInfo.CurrentCulture = new CultureInfo("de-DE"); + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, CancellationToken.None); + } + finally + { + CultureInfo.CurrentCulture = originalCulture; + } + + string vectorJson = (string)resolvedParams["q"]!; + + // No spaces + Assert.IsFalse(vectorJson.Contains(" "), "Vector JSON should not contain spaces"); + // Period decimal separator (not German comma) + Assert.IsTrue(vectorJson.Contains("0.1"), "Vector JSON should use period as decimal separator"); + // Bracketed + Assert.IsTrue(vectorJson.StartsWith("["), "Vector JSON should start with ["); + Assert.IsTrue(vectorJson.EndsWith("]"), "Vector JSON should end with ]"); + // Comma-separated values + Assert.AreEqual(3, vectorJson.Split(',').Length, "Vector JSON should have 3 comma-separated values"); + + // Round-trip the parsed values to confirm G9 precision + string trimmed = vectorJson.Substring(1, vectorJson.Length - 2); + string[] parts = trimmed.Split(','); + Assert.AreEqual(0.1f, float.Parse(parts[0], CultureInfo.InvariantCulture)); + Assert.AreEqual(-0.2f, float.Parse(parts[1], CultureInfo.InvariantCulture)); + Assert.AreEqual(0.0001234567f, float.Parse(parts[2], CultureInfo.InvariantCulture)); + } + + /// + /// The CancellationToken passed to the helper must be forwarded to the embedding + /// service's batch call so request cancellation propagates correctly. + /// + [TestMethod] + public async Task CancellationToken_ForwardedToEmbeddingService() + { + Dictionary resolvedParams = new() { { "q", "hello" } }; + List configParams = new() { EmbedParam("q") }; + SetupBatch(new[] { 0.1f }); + + using CancellationTokenSource cts = new(); + CancellationToken token = cts.Token; + + await ParameterEmbeddingHelper.SubstituteEmbedParametersAsync( + resolvedParams, configParams, _mockService.Object, token); + + _mockService.Verify( + s => s.TryEmbedBatchAsync( + It.IsAny(), + It.Is(ct => ct == token)), + Times.Once); + } + + #endregion +} diff --git a/src/Service.Tests/UnitTests/SqlMetadataProviderUnitTests.cs b/src/Service.Tests/UnitTests/SqlMetadataProviderUnitTests.cs index dd6ad7d27e..d15985d5dd 100644 --- a/src/Service.Tests/UnitTests/SqlMetadataProviderUnitTests.cs +++ b/src/Service.Tests/UnitTests/SqlMetadataProviderUnitTests.cs @@ -670,5 +670,205 @@ public async Task CheckAutoentitiesQuery(string[] include, string[] exclude, str TestHelper.UnsetAllDABEnvironmentVariables(); } + + #region Embed Type Override + + // ───────────────────────────────────────────────────────────────── + // Tests for MsSqlMetadataProvider.ApplyEmbedTypeOverride (Phase 3) + // + // The helper is invoked at startup for each parameter defined in + // config for a stored procedure. When embed:true is set, it: + // - validates the parameter is VECTOR-shaped (SystemType == byte[]) + // - overrides SystemType/DbType/SqlDbType to flow as String + // + // These tests construct ParameterDefinition / ParameterMetadata + // pairs directly and invoke the helper — no DB, no DI container, + // no full metadata-provider construction. + // ───────────────────────────────────────────────────────────────── + + /// + /// embed:true on a parameter that SQL Server reports as Byte[] (i.e., VECTOR(N) + /// surfaced via INFORMATION_SCHEMA.PARAMETERS as varbinary) overrides the + /// metadata type so the vector JSON string flows through DAB's String pipeline. + /// + [TestMethod] + public void ApplyEmbedTypeOverride_ByteArrayParam_WithEmbedTrue_OverridesToString() + { + ParameterDefinition def = new() + { + Name = "query_vector", + SystemType = typeof(byte[]) + }; + ParameterMetadata meta = new() { Name = "query_vector", Embed = true }; + + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + def, meta, schemaName: "dbo", storedProcedureName: "sp_search", parameterName: "query_vector"); + + Assert.AreEqual(typeof(string), def.SystemType); + Assert.AreEqual(System.Data.DbType.String, def.DbType); + Assert.AreEqual(System.Data.SqlDbType.NVarChar, def.SqlDbType); + } + + /// + /// embed:true on a parameter that the SQL Server reports as something other than + /// Byte[] — e.g., a real string param — must throw at startup with a 503 and + /// a clear message naming the schema/sproc/parameter and saying VECTOR(N) is required. + /// + [TestMethod] + public void ApplyEmbedTypeOverride_StringParam_WithEmbedTrue_ThrowsAtStartup() + { + ParameterDefinition def = new() + { + Name = "query_text", + SystemType = typeof(string) + }; + ParameterMetadata meta = new() { Name = "query_text", Embed = true }; + + DataApiBuilderException ex = Assert.ThrowsException(() => + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + def, meta, schemaName: "dbo", storedProcedureName: "sp_test", parameterName: "query_text")); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + Assert.AreEqual(DataApiBuilderException.SubStatusCodes.ErrorInInitialization, ex.SubStatusCode); + StringAssert.Contains(ex.Message, "dbo.sp_test"); + StringAssert.Contains(ex.Message, "query_text"); + StringAssert.Contains(ex.Message, "String"); + StringAssert.Contains(ex.Message, "VECTOR"); + } + + /// + /// embed:true on a numeric parameter (e.g., INT) is also rejected with the same + /// 503. Catches a common misconfiguration where embed is applied to a count param. + /// + [TestMethod] + public void ApplyEmbedTypeOverride_IntParam_WithEmbedTrue_ThrowsAtStartup() + { + ParameterDefinition def = new() + { + Name = "top_k", + SystemType = typeof(int) + }; + ParameterMetadata meta = new() { Name = "top_k", Embed = true }; + + DataApiBuilderException ex = Assert.ThrowsException(() => + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + def, meta, schemaName: "dbo", storedProcedureName: "sp_top", parameterName: "top_k")); + + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, ex.StatusCode); + Assert.AreEqual(DataApiBuilderException.SubStatusCodes.ErrorInInitialization, ex.SubStatusCode); + StringAssert.Contains(ex.Message, "Int32"); + StringAssert.Contains(ex.Message, "VECTOR"); + } + + /// + /// embed:false on a Byte[] parameter (e.g., a real varbinary blob like an image) + /// must NOT be touched — the override only fires for embed:true params. The + /// parameter's metadata should be returned exactly as it came in. + /// + [TestMethod] + public void ApplyEmbedTypeOverride_ByteArrayParam_WithEmbedFalse_NoChange() + { + ParameterDefinition def = new() + { + Name = "image_blob", + SystemType = typeof(byte[]), + DbType = System.Data.DbType.Binary, + SqlDbType = System.Data.SqlDbType.VarBinary + }; + ParameterMetadata meta = new() { Name = "image_blob", Embed = false }; + + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + def, meta, schemaName: "dbo", storedProcedureName: "sp_upload", parameterName: "image_blob"); + + // Original type metadata preserved exactly + Assert.AreEqual(typeof(byte[]), def.SystemType); + Assert.AreEqual(System.Data.DbType.Binary, def.DbType); + Assert.AreEqual(System.Data.SqlDbType.VarBinary, def.SqlDbType); + } + + /// + /// Edge case: embed:true behavior is independent of + /// and . The override fires purely on + /// + being byte[]. Other + /// metadata fields are not affected. + /// + [TestMethod] + public void ApplyEmbedTypeOverride_ByteArrayWithRequiredAndDefault_OnlyTypeMetadataChanges() + { + ParameterDefinition def = new() + { + Name = "v", + SystemType = typeof(byte[]), + Required = true, + Default = "should-be-untouched-by-override", + HasConfigDefault = true, + ConfigDefaultValue = "should-be-untouched-by-override" + }; + ParameterMetadata meta = new() { Name = "v", Embed = true }; + + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + def, meta, schemaName: "dbo", storedProcedureName: "sp_v", parameterName: "v"); + + // Type metadata overridden + Assert.AreEqual(typeof(string), def.SystemType); + Assert.AreEqual(System.Data.DbType.String, def.DbType); + Assert.AreEqual(System.Data.SqlDbType.NVarChar, def.SqlDbType); + + // Other fields untouched (note: validator separately rejects embed:true + default + // before request execution; this test confirms the metadata helper itself is + // narrow-scoped and doesn't mutate fields outside its concern). + Assert.AreEqual(true, def.Required); + Assert.AreEqual("should-be-untouched-by-override", def.Default); + Assert.AreEqual(true, def.HasConfigDefault); + Assert.AreEqual("should-be-untouched-by-override", def.ConfigDefaultValue); + } + + /// + /// Multi-parameter scenario: in a real metadata-loading loop, the helper is + /// called once per parameter. Calling it on a mix of embed:true Byte[] params + /// (overridden) and embed:false params (untouched) confirms the helper has no + /// cross-call state. + /// + [TestMethod] + public void ApplyEmbedTypeOverride_MultipleParams_OnlyEmbedTrueOnesOverridden() + { + ParameterDefinition embedDef = new() + { + Name = "primary_query", + SystemType = typeof(byte[]) + }; + ParameterDefinition normalDef = new() + { + Name = "top_k", + SystemType = typeof(int) + }; + ParameterDefinition anotherEmbedDef = new() + { + Name = "secondary_query", + SystemType = typeof(byte[]) + }; + + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + embedDef, + new ParameterMetadata { Name = "primary_query", Embed = true }, + schemaName: "dbo", storedProcedureName: "sp_hybrid", parameterName: "primary_query"); + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + normalDef, + new ParameterMetadata { Name = "top_k", Embed = false }, + schemaName: "dbo", storedProcedureName: "sp_hybrid", parameterName: "top_k"); + MsSqlMetadataProvider.ApplyEmbedTypeOverride( + anotherEmbedDef, + new ParameterMetadata { Name = "secondary_query", Embed = true }, + schemaName: "dbo", storedProcedureName: "sp_hybrid", parameterName: "secondary_query"); + + // Both embed:true params overridden to String pipeline + Assert.AreEqual(typeof(string), embedDef.SystemType); + Assert.AreEqual(typeof(string), anotherEmbedDef.SystemType); + + // Non-embed param's type unchanged + Assert.AreEqual(typeof(int), normalDef.SystemType); + } + + #endregion } }