From 2cd42e8a24d0a9f35b4aa14dcf0728e128fa3da7 Mon Sep 17 00:00:00 2001 From: Sasindu Alahakoon Date: Wed, 22 Oct 2025 15:20:44 +0530 Subject: [PATCH 01/10] Add basic Ai Search knowledgebase implementation --- ballerina/azure_ai_search_knowledgebase.bal | 712 ++++++++++++++++++++ 1 file changed, 712 insertions(+) create mode 100644 ballerina/azure_ai_search_knowledgebase.bal diff --git a/ballerina/azure_ai_search_knowledgebase.bal b/ballerina/azure_ai_search_knowledgebase.bal new file mode 100644 index 0000000..0dea4de --- /dev/null +++ b/ballerina/azure_ai_search_knowledgebase.bal @@ -0,0 +1,712 @@ +// Copyright (c) 2025 WSO2 LLC (http://www.wso2.com). +// +// WSO2 LLC. licenses this file to you under the Apache License, +// Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import ballerina/ai; +import ballerina/log; +import ballerina/uuid; +import ballerinax/azure.ai.search as search; +import ballerinax/azure.ai.search.index; + +const CONTENT_FIELD_NAME = "content"; +const KEY_FIELD_NAME = "id"; +const API_VERSION = "2025-09-01"; +const API_KEY_HEADER_NAME = "api-key"; + +# Information about the analyzed index schema +type IndexSchemaInfo record { + # Name of the key field in the index + string keyFieldName; + # Names of vector fields that need embeddings + string[] vectorFieldNames; + # Names of content fields that are searchable + string[] contentFieldNames; + # Map of all fields in the index + map allFields; +}; + +# Configuration for the Azure AI Service Clients +public type ClientConfiguration record {| + # Connection configuration for the Azure AI search client that use for create search index + # This configuration is only required when the `index` parameter + # is provided as an `search:SearchIndex` (i.e., when the system will create the index). + search:ConnectionConfig searchClientConnectionConfig = {}; + # Connection configuration for the Azure AI index client that use for index operations + index:ConnectionConfig indexClientConnectionConfig = {}; +|}; + +# Represents the Azure Search Knowledge Base implementation. +# User should create the required `indexer`, `data source` and `index` beforehand using +# the util functions provided in this module. +# Currently search fields only supported with `id`, `content` and `type` field names. 
+public distinct isolated class AzureAiSearchKnowledgeBase { + *ai:KnowledgeBase; + + private final search:SearchIndex index; + private final search:Client serviceClient; + private final index:Client indexClient; + private final string apiVersion; + private final string apiKey; + private final boolean verbose; + private final ai:Chunker|ai:AUTO|ai:DISABLE chunker; + private final ai:EmbeddingProvider embeddingModel; + private final string contentFieldName; + private final string keyFieldName; + private final string[] vectorFieldNames; + private final map allFields; + + # Initializes a new `AzureAiSearchKnowledgeBase` instance. + # + # + serviceUrl - The service URL of the Azure AI Search instance + # + apiKey - The API key for authenticating with the Azure AI Search service + # + index - The name of an existing search index or a `search:SearchIndex` definition to create + # + embeddingModel - The embedding model to use for generating embeddings + # + chunker - The chunker to use for chunking documents before ingestion. Defaults to `ai:AUTO`. + # + verbose - Whether to enable verbose logging. Defaults to `false`. + # + apiVersion - The API version to use for requests. + # + clientConfigurations - Additional client configurations for Azure AI Search clients + # + contentFieldName - The name of the field in the index that contains the main content. Defaults to "content". + # + return - An instance of `AzureAiSearchKnowledgeBase` or an `ai:Error` if initialization fails + public isolated function init(string serviceUrl, string apiKey, string|search:SearchIndex index, ai:EmbeddingProvider embeddingModel, + ai:Chunker|ai:AUTO|ai:DISABLE chunker = ai:AUTO, boolean verbose = false, + string apiVersion = API_VERSION, string contentFieldName = CONTENT_FIELD_NAME, + *ClientConfiguration clientConfigurations) returns ai:Error? 
{ + self.chunker = chunker; + self.embeddingModel = embeddingModel; + self.verbose = verbose; + self.contentFieldName = contentFieldName; + + // Initialize service client for management operations + search:ConnectionConfig searchClientConfig = clientConfigurations.searchClientConnectionConfig; + self.apiKey = apiKey; + self.apiVersion = apiVersion; + + search:Client|error serviceClient = new search:Client(serviceUrl, searchClientConfig); + if serviceClient is error { + return error ai:Error("Failed to initialize Azure AI Service Client", serviceClient); + } + + self.serviceClient = serviceClient; + + string indexName = index is string ? index : index.name; + if index is string { + // Verify that the index exists + search:SearchIndex|error searchIndex = self.serviceClient->indexesGet(indexName, { + [API_KEY_HEADER_NAME]: self.apiKey}, {api\-version: self.apiVersion}); + if searchIndex is error { + logIfVerboseEnable(self.verbose, string `Search index ${indexName} does not exist: ${searchIndex.message()}`); + return error ai:Error("Failed to verify existence of index", searchIndex); + } + + self.index = searchIndex.cloneReadOnly(); + logIfVerboseEnable(self.verbose, string `Search index ${indexName} exists. 
Details: ${searchIndex.toJsonString()}`); + } else { + logIfVerboseEnable(self.verbose, string `Attempting to create search index ${indexName}...`); + search:SearchIndex|error createdIndex = self.serviceClient->indexesCreateOrUpdate(indexName, { + [API_KEY_HEADER_NAME]: self.apiKey, Prefer: "return=representation"}, index, {api\-version: self.apiVersion}); + if createdIndex is error { + logIfVerboseEnable(self.verbose, string `Failed to create search index ${indexName}: ${createdIndex.message()}`); + return error ai:Error("Failed to create search index", createdIndex); + } + self.index = createdIndex.cloneReadOnly(); + logIfVerboseEnable(self.verbose, string `Search index ${indexName} created successfully.`); + } + + string indexServiceUrl = string `${serviceUrl}/indexes('${indexName}')`; + logIfVerboseEnable(self.verbose, string `Initializing Azure Index Client for index URL: ${indexServiceUrl}`); + index:Client|error indexClient = new (indexServiceUrl, clientConfigurations.indexClientConnectionConfig); + if indexClient is error { + logIfVerboseEnable(self.verbose, string `Failed to initialize Azure Index Client: ${indexClient.message()}`); + return error ai:Error("Failed to initialize Azure Index Client", indexClient); + } + self.indexClient = indexClient; + + lock { + IndexSchemaInfo schemaInfo = check analyzeIndexSchema(self.verbose, self.index, self.contentFieldName); + + self.keyFieldName = schemaInfo.keyFieldName; + self.vectorFieldNames = schemaInfo.vectorFieldNames.cloneReadOnly(); + self.allFields = schemaInfo.allFields.cloneReadOnly(); + } + } + + # Ingests documents into the Azure search knowledge base. + # + documents - The documents or chunks to ingest (single document, array of documents, or array of chunks) + # + return - An `ai:Error` if ingestion fails, otherwise `nil` + public isolated function ingest(ai:Chunk[]|ai:Document[]|ai:Document documents) returns ai:Error? 
{ + lock { + ai:Chunk[]|ai:Error chunks = self.chunk(documents.clone()); + if chunks is ai:Error { + logIfVerboseEnable(self.verbose, string `Failed to chunk documents: ${chunks.message()}}`, chunks); + return error ai:Error("Failed to chunk documents before ingestion", chunks); + } + + ai:Embedding[]|error embeddings = self.embeddingModel->batchEmbed(chunks); + if embeddings is error { + logIfVerboseEnable(self.verbose, string `Failed to generate embeddings for documents: ${embeddings.message()}}`, embeddings); + return error ai:Error("Failed to generate embeddings for documents", embeddings); + } + logIfVerboseEnable(self.verbose, string `Generated embeddings for ${embeddings.length().toString()} chunks.`); + + index:IndexDocumentsResult|error uploadResult = self.uploadDocuments(self.indexClient, chunks, self.index, + embeddings, {[API_KEY_HEADER_NAME]: self.apiKey}, {api\-version: self.apiVersion}); + if uploadResult is error { + logIfVerboseEnable(self.verbose, string `Failed to upload documents to search index: ${uploadResult.message()}}`, uploadResult); + return error ai:Error("Failed to upload documents to search index", uploadResult); + } + + // Validate that all documents were successfully indexed + foreach index:IndexingResult result in uploadResult.value { + if !result.status { + return error ai:Error(string `Failed to index document with key ${result.'key}: ${result.errorMessage ?: "Unknown error"}`); + } + } + + return; + } + } + + # Retrieves relevant chunks for the given query using vector search. + # + # + query - The text query to search for + # + maxLimit - The maximum number of items to return + # + filters - Optional metadata filters to apply during retrieval + # + return - An array of matching chunks with similarity scores, or an `ai:Error` if retrieval fails + public isolated function retrieve(string query, int maxLimit = 10, ai:MetadataFilters? 
filters = ()) returns ai:QueryMatch[]|ai:Error { + if query is "" { + return error ai:Error("Query cannot be empty for retrieval"); + } + + if maxLimit != -1 && maxLimit <= 0 { + return error ai:Error("maxLimit must be a positive integer"); + } + + if maxLimit > int:SIGNED32_MAX_VALUE { + return error ai:Error(string `maxLimit exceeds maximum allowed value of ${int:SIGNED32_MAX_VALUE}`); + } + + lock { + ai:TextChunk queryChunk = {content: query, 'type: "text-chunk"}; + ai:Embedding queryEmbedding = check self.embeddingModel->embed(queryChunk); + + // Create vector search request using Azure AI Search's integrated vectorization + int vectorFieldLength = self.vectorFieldNames.length(); + index:VectorQuery[]? vectorQuery = (); + + if vectorFieldLength != 0 { + ai:Vector|ai:Error vectors = self.generateVector(queryEmbedding); + if vectors is ai:Error { + return vectors; + } + + vectorQuery = [ + { + kind: "vector", + k: maxLimit == -1 ? () : maxLimit, + fields: string:'join(",", ...self.vectorFieldNames), + "vector": vectors + } + ]; + } + + index:SearchRequest searchRequest = { + search: query, + 'select: "*", + vectorQueries: vectorQuery ?: [], + top: maxLimit == -1 ? () : maxLimit + }; + + // Apply metadata filters if provided + if filters is ai:MetadataFilters { + string? 
filterExpression = self.buildODataFilter(filters.cloneReadOnly()); + if filterExpression is string { + searchRequest.filter = filterExpression; + } + } + + // Execute search + index:SearchDocumentsResult|error searchResult = self.indexClient->documentsSearchPost( + searchRequest, + {[API_KEY_HEADER_NAME]: self.apiKey}, + api\-version = self.apiVersion + ); + + if searchResult is error { + logIfVerboseEnable(self.verbose, string `Failed to retrieve documents from Azure AI Search: ${searchResult.message()}}`, searchResult); + return error ai:Error("Failed to retrieve documents from Azure AI Search", searchResult); + } + + // Convert search results to QueryMatch array + ai:QueryMatch[] matches = []; + foreach index:SearchResult result in searchResult.value { + ai:Chunk chunk = { + 'type: "text-chunk", + content: self.getFieldValue(result, self.contentFieldName), + metadata: self.extractMetadata(result) + }; + + ai:QueryMatch queryMatch = { + chunk: chunk, + similarityScore: result.\@search\.score + }; + matches.push(queryMatch); + } + + return matches.cloneReadOnly(); + } + } + + # Deletes chunks that match the given metadata filters. + # + # + filters - The metadata filters used to identify which chunks to delete + # + return - An `ai:Error` if the deletion fails, otherwise `nil` + public isolated function deleteByFilter(ai:MetadataFilters filters) returns ai:Error? { + ai:MetadataFilters filtersCopy = filters.cloneReadOnly(); + // First, search for documents matching the filters + string? 
filterExpression = self.buildODataFilter(filtersCopy); + + index:SearchRequest searchRequest = { + filter: filterExpression, + 'select: self.keyFieldName + // TODO: Implement batching if large number of documents expected + }; + + index:SearchDocumentsResult|error searchResult = self.indexClient->documentsSearchPost( + searchRequest, + {[API_KEY_HEADER_NAME]: self.apiKey}, + api\-version = self.apiVersion + ); + + if searchResult is error { + logIfVerboseEnable(self.verbose, string `Failed to search for documents to delete: ${searchResult.message()}}`, searchResult); + return error ai:Error("Failed to search for documents to delete", searchResult); + } + + // Extract document IDs + string[] documentIds = []; + foreach index:SearchResult result in searchResult.value { + string? documentId = self.getFieldValue(result, self.keyFieldName); + if documentId is string { + documentIds.push(documentId); + } + } + + if documentIds.length() == 0 { + return; // No documents found matching the filters + } + + // Create delete actions + index:IndexAction[] deleteActions = []; + foreach string docId in documentIds { + index:IndexAction deleteAction = { + \@search\.action: "delete" + }; + // Set the key field for deletion + deleteAction[self.keyFieldName] = docId; + deleteActions.push(deleteAction); + } + + // Execute batch delete + index:IndexBatch deleteBatch = { + value: deleteActions + }; + + index:IndexDocumentsResult|error deleteResult = self.indexClient->documentsIndex( + deleteBatch, + {[API_KEY_HEADER_NAME]: self.apiKey}, + api\-version = self.apiVersion + ); + + if deleteResult is error { + return error ai:Error("Failed to delete documents from Azure AI Search", deleteResult); + } + + // Check for any failures in the delete operation + foreach index:IndexingResult result in deleteResult.value { + if !result.status { + return error ai:Error(string `Failed to delete document with key ${result.'key}: ${result.errorMessage ?: "Unknown error"}`); + } + } + + return; + } + + 
private isolated function buildODataFilter(ai:MetadataFilters filters) returns string? { + return self.convertFiltersToOData(filters); + } + + private isolated function convertFiltersToOData(ai:MetadataFilters|ai:MetadataFilter node) returns string? { + if node is ai:MetadataFilter { + return self.convertSingleFilterToOData(node); + } + + // Handle MetadataFilters with multiple filters + string[] filterExpressions = []; + foreach ai:MetadataFilters|ai:MetadataFilter child in node.filters { + string? childExpression = self.convertFiltersToOData(child); + if childExpression is string { + filterExpressions.push(childExpression); + } + } + + if filterExpressions.length() == 0 { + return (); + } + + if filterExpressions.length() == 1 { + return filterExpressions[0]; + } + + // Combine filters with the appropriate logical operator + string logicalOperator = node.condition == ai:AND ? " and " : " or "; + return string `(${string:'join(logicalOperator, ...filterExpressions)})`; + } + + private isolated function convertSingleFilterToOData(ai:MetadataFilter filter) returns string? 
{ + string fieldName = filter.key; + json value = filter.value; + ai:MetadataFilterOperator operator = filter.operator; + + match operator { + ai:EQUAL => { + return self.buildEqualityFilter(fieldName, value); + } + ai:NOT_EQUAL => { + return self.buildInequalityFilter(fieldName, value); + } + ai:IN => { + return self.buildInFilter(fieldName, value); + } + ai:NOT_IN => { + return self.buildNotInFilter(fieldName, value); + } + ai:GREATER_THAN => { + return self.buildComparisonFilter(fieldName, value, "gt"); + } + ai:LESS_THAN => { + return self.buildComparisonFilter(fieldName, value, "lt"); + } + ai:GREATER_THAN_OR_EQUAL => { + return self.buildComparisonFilter(fieldName, value, "ge"); + } + ai:LESS_THAN_OR_EQUAL => { + return self.buildComparisonFilter(fieldName, value, "le"); + } + _ => { + return (); // Unsupported operator + } + } + } + + private isolated function buildEqualityFilter(string fieldName, json value) returns string? { + string? formattedValue = self.formatValueForOData(value); + if formattedValue is string { + return string `${fieldName} eq ${formattedValue}`; + } + return (); + } + + private isolated function buildInequalityFilter(string fieldName, json value) returns string? { + string? formattedValue = self.formatValueForOData(value); + if formattedValue is string { + return string `${fieldName} ne ${formattedValue}`; + } + return (); + } + + private isolated function buildInFilter(string fieldName, json value) returns string? { + if value is json[] && value.length() > 0 { + string[] conditions = []; + foreach json item in value { + string? formattedValue = self.formatValueForOData(item); + if formattedValue is string { + conditions.push(string `${fieldName} eq ${formattedValue}`); + } + } + if conditions.length() > 0 { + return "(" + string:'join(" or ", ...conditions) + ")"; + } + } + return (); + } + + private isolated function buildNotInFilter(string fieldName, json value) returns string? 
{ + if value is json[] && value.length() > 0 { + string[] conditions = []; + foreach json item in value { + string? formattedValue = self.formatValueForOData(item); + if formattedValue is string { + conditions.push(string `${fieldName} ne ${formattedValue}`); + } + } + if conditions.length() > 0 { + return "(" + string:'join(" and ", ...conditions) + ")"; + } + } + return (); + } + + private isolated function buildComparisonFilter(string fieldName, json value, string odataOperator) returns string? { + string? formattedValue = self.formatValueForOData(value); + if formattedValue is string { + return string `${fieldName} ${odataOperator} ${formattedValue}`; + } + return (); + } + + private isolated function formatValueForOData(json value) returns string? { + if value is string { + // Escape single quotes in strings and wrap in single quotes + string escapedValue = re `'`.replaceAll(value, "''"); + return string `'${escapedValue}'`; + } else if value is int|decimal { + return value.toString(); + } else if value is boolean { + return value.toString(); + } + // For other types (like null), return null to indicate unsupported + return (); + } + + private isolated function getFieldValue(index:SearchResult result, string fieldName) returns string { + anydata fieldValue = result[fieldName]; + if fieldValue is string { + return fieldValue; + } + if fieldValue is () { + logIfVerboseEnable(self.verbose, string `Field ${fieldName} is null in search result.`); + return ""; + } + // Handle other types if they are possible content + return fieldValue.toString(); + } + + private isolated function extractMetadata(index:SearchResult result) returns ai:Metadata { + lock { + ai:Metadata metadata = {}; + + // Extract all fields except the core content/title fields as metadata + map clonedResult = result.cloneReadOnly(); + foreach string k in clonedResult.keys() { + anydata value = clonedResult[k]; + if k != self.contentFieldName && k != self.keyFieldName && 
self.vectorFieldNames.indexOf(k) == () && + k != "@search.score" && k != "@search.highlights" { + if value is json { + metadata[k] = value; + } + } + } + + return metadata.cloneReadOnly(); + } + } + + private isolated function chunk(ai:Document|ai:Document[]|ai:Chunk[] input) returns ai:Chunk[]|ai:Error { + (ai:Document|ai:Chunk)[] inputs = input is ai:Document[]|ai:Chunk[] ? input : [input]; + ai:Chunker|ai:AUTO|ai:DISABLE chunker = self.chunker; + if chunker is ai:DISABLE { + return inputs; + } + ai:Chunk[] chunks = []; + foreach ai:Document|ai:Chunk item in inputs { + ai:Chunker chunkerToUse = chunker is ai:Chunker ? chunker : guessChunker(item); + chunks.push(...check chunkerToUse.chunk(item)); + } + return chunks; + } + + private isolated function uploadDocuments( + index:Client 'client, + (ai:Document|ai:Chunk)[] documents, + search:SearchIndex index, + ai:Embedding[]? embeddings = (), + index:DocumentsIndexHeaders headers = {}, + index:DocumentsIndexQueries queries = {api\-version: API_VERSION} + ) returns index:IndexDocumentsResult|error { + if embeddings is ai:Embedding[] && embeddings.length() != documents.length() { + return error ai:Error("Embeddings count does not match documents count, Embeddings length: " + + string `${embeddings.length()}, Documents length: ${documents.length()}`); + } + + lock { + index:IndexAction[] indexActions = []; + (ai:Document|ai:Chunk)[] & readonly docs = documents.cloneReadOnly(); + ai:Embedding[]? 
embeddingValues = embeddings.cloneReadOnly(); + foreach int i in 0..documentsIndex(batch.cloneReadOnly(), headers.cloneReadOnly(), queries.cloneReadOnly()); + } + } + + private isolated function generateVector(ai:Embedding embedding) returns ai:Vector|ai:Error { + if embedding is ai:Vector { + return embedding; + } else if embedding is ai:HybridVector { + // Return the dense part, discard sparse + return embedding.dense; + } else { + // Explicitly fail for sparse-only embeddings + return error ai:Error("AzureAiSearchKnowledgeBase only supports dense or hybrid embeddings, but received a SparseVector."); + } + } +} + +isolated function logIfVerboseEnable(boolean verbose, string value, 'error? err = ()) { + if verbose { + log:printInfo(string `[AzureAiSearchKnowledgeBase] ${value}`); + if err is error { + log:printError(string `[AzureAiSearchKnowledgeBase] Error Details: ${err.message()}`, err); + } + } +} + +isolated function guessChunker(ai:Document|ai:Chunk doc) returns ai:Chunker { + // Guess the chunker based on the document type or mimeType in metadata + string? mimeType = doc.metadata?.mimeType; + if mimeType == "text/markdown" { + return new ai:MarkdownChunker(); + } + if mimeType == "text/html" { + return new ai:HtmlChunker(); + } + // Fallback to file name + string? fileName = doc.metadata?.fileName; + if fileName is string { + if fileName.endsWith(".md") { + return new ai:MarkdownChunker(); + } + if fileName.endsWith(".html") { + return new ai:HtmlChunker(); + } + } + return new ai:GenericRecursiveChunker(); +} + +isolated function analyzeIndexSchema(boolean verbose, search:SearchIndex index, string contentFieldName) returns IndexSchemaInfo|ai:Error { + string? 
keyFieldName = (); + string[] vectorFieldNames = []; + string[] contentFieldNames = []; + map allFields = {}; + + foreach search:SearchField indexField in index.fields { + allFields[indexField.name] = indexField; + + // Identify key field + if indexField.'key == true { + keyFieldName = indexField.name; + } + + // Identify vector fields (fields with dimensions and vector search profile) + if indexField?.dimensions is int && indexField?.vectorSearchProfile is string { + vectorFieldNames.push(indexField.name); + } + + // Identify potential content fields (searchable string fields) + if indexField.name == contentFieldName { + contentFieldNames.push(indexField.name); + } + } + + if vectorFieldNames.length() == 0 { + logIfVerboseEnable(verbose, "No vector fields found in index schema."); + } + + if contentFieldNames.length() == 0 { + return error(string `Index schema must contains a field named '${contentFieldName}'.`); + } + + if keyFieldName is () { + logIfVerboseEnable(verbose, string `No key field defined in index schema. Using default key field name as '${KEY_FIELD_NAME}'.`); + } + + if vectorFieldNames.length() > 1 { + logIfVerboseEnable(verbose, string `Multiple vector fields found in index schema: ${string:'join(", ", ...vectorFieldNames)}. Currently one vecotr field is prefered. 
So for now, there is more than one, all the vector fileds will share the same vectors.`); + } + + return { + keyFieldName: keyFieldName ?: KEY_FIELD_NAME, + vectorFieldNames: vectorFieldNames, + contentFieldNames: contentFieldNames, + allFields: allFields + }; +} From 291a240b57488d1f739d747747b4cfc26e942fd1 Mon Sep 17 00:00:00 2001 From: Sasindu Alahakoon Date: Wed, 22 Oct 2025 15:31:44 +0530 Subject: [PATCH 02/10] [Automated] Update the toml files --- ballerina/Ballerina.toml | 12 +++++++++ ballerina/Dependencies.toml | 52 +++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/ballerina/Ballerina.toml b/ballerina/Ballerina.toml index 0fe26e5..0bb7100 100644 --- a/ballerina/Ballerina.toml +++ b/ballerina/Ballerina.toml @@ -17,3 +17,15 @@ groupId = "io.ballerina.lib" artifactId = "ai.azure-native" version = "1.2.0" path = "../native/build/libs/ai.azure-native-1.2.0-SNAPSHOT.jar" + +[[dependency]] +org="ballerinax" +name="azure.ai.search" +version="1.0.0" +repository="local" + +[[dependency]] +org="ballerinax" +name="azure.ai.search.index" +version="1.0.0" +repository="local" diff --git a/ballerina/Dependencies.toml b/ballerina/Dependencies.toml index d47d55d..447bb9d 100644 --- a/ballerina/Dependencies.toml +++ b/ballerina/Dependencies.toml @@ -73,7 +73,7 @@ modules = [ [[package]] org = "ballerina" name = "crypto" -version = "2.9.1" +version = "2.9.2" dependencies = [ {org = "ballerina", name = "jballerina.java"}, {org = "ballerina", name = "time"} @@ -82,7 +82,7 @@ dependencies = [ [[package]] org = "ballerina" name = "data.jsondata" -version = "1.1.2" +version = "1.1.3" dependencies = [ {org = "ballerina", name = "jballerina.java"}, {org = "ballerina", name = "lang.object"} @@ -91,7 +91,7 @@ dependencies = [ [[package]] org = "ballerina" name = "data.xmldata" -version = "1.5.0" +version = "1.5.2" dependencies = [ {org = "ballerina", name = "jballerina.java"}, {org = "ballerina", name = "lang.object"} @@ -262,13 
+262,16 @@ dependencies = [ [[package]] org = "ballerina" name = "log" -version = "2.13.0" +version = "2.14.0" dependencies = [ {org = "ballerina", name = "io"}, {org = "ballerina", name = "jballerina.java"}, {org = "ballerina", name = "lang.value"}, {org = "ballerina", name = "observe"} ] +modules = [ + {org = "ballerina", packageName = "log", moduleName = "log"} +] [[package]] org = "ballerina" @@ -278,7 +281,7 @@ version = "1.2.0" [[package]] org = "ballerina" name = "mcp" -version = "1.0.0" +version = "1.0.1" dependencies = [ {org = "ballerina", name = "http"}, {org = "ballerina", name = "jballerina.java"}, @@ -376,6 +379,9 @@ dependencies = [ {org = "ballerina", name = "lang.int"}, {org = "ballerina", name = "time"} ] +modules = [ + {org = "ballerina", packageName = "uuid", moduleName = "uuid"} +] [[package]] org = "ballerina" @@ -408,7 +414,11 @@ dependencies = [ {org = "ballerina", name = "http"}, {org = "ballerina", name = "jballerina.java"}, {org = "ballerina", name = "lang.array"}, + {org = "ballerina", name = "log"}, {org = "ballerina", name = "test"}, + {org = "ballerina", name = "uuid"}, + {org = "ballerinax", name = "azure.ai.search"}, + {org = "ballerinax", name = "azure.ai.search.index"}, {org = "ballerinax", name = "azure.openai.chat"}, {org = "ballerinax", name = "azure.openai.embeddings"} ] @@ -416,6 +426,38 @@ modules = [ {org = "ballerinax", packageName = "ai.azure", moduleName = "ai.azure"} ] +[[package]] +org = "ballerinax" +name = "azure.ai.search" +version = "1.0.0" +dependencies = [ + {org = "ballerina", name = "data.jsondata"}, + {org = "ballerina", name = "http"}, + {org = "ballerina", name = "log"}, + {org = "ballerina", name = "url"}, + {org = "ballerina", name = "uuid"}, + {org = "ballerinai", name = "observe"} +] +modules = [ + {org = "ballerinax", packageName = "azure.ai.search", moduleName = "azure.ai.search"} +] + +[[package]] +org = "ballerinax" +name = "azure.ai.search.index" +version = "1.0.0" +dependencies = [ + {org = 
"ballerina", name = "constraint"}, + {org = "ballerina", name = "data.jsondata"}, + {org = "ballerina", name = "http"}, + {org = "ballerina", name = "log"}, + {org = "ballerina", name = "url"}, + {org = "ballerinai", name = "observe"} +] +modules = [ + {org = "ballerinax", packageName = "azure.ai.search.index", moduleName = "azure.ai.search.index"} +] + [[package]] org = "ballerinax" name = "azure.openai.chat" From cf93a111d57f637791f7b4338c7a63d458be819b Mon Sep 17 00:00:00 2001 From: Sasindu Alahakoon Date: Wed, 22 Oct 2025 15:48:04 +0530 Subject: [PATCH 03/10] [Automated] Update the toml files --- ballerina/Dependencies.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ballerina/Dependencies.toml b/ballerina/Dependencies.toml index 447bb9d..77971fd 100644 --- a/ballerina/Dependencies.toml +++ b/ballerina/Dependencies.toml @@ -111,7 +111,7 @@ dependencies = [ [[package]] org = "ballerina" name = "http" -version = "2.14.6" +version = "2.14.7" dependencies = [ {org = "ballerina", name = "auth"}, {org = "ballerina", name = "cache"}, From a849984c82a175e536387301de2ffc546c97cc28 Mon Sep 17 00:00:00 2001 From: Sasindu Alahakoon Date: Thu, 23 Oct 2025 12:51:38 +0530 Subject: [PATCH 04/10] refactor the knowledgebase implementation --- ballerina/azure_ai_search_knowledgebase.bal | 533 ++++++++++++-------- 1 file changed, 332 insertions(+), 201 deletions(-) diff --git a/ballerina/azure_ai_search_knowledgebase.bal b/ballerina/azure_ai_search_knowledgebase.bal index 0dea4de..fa84980 100644 --- a/ballerina/azure_ai_search_knowledgebase.bal +++ b/ballerina/azure_ai_search_knowledgebase.bal @@ -22,9 +22,46 @@ import ballerinax/azure.ai.search.index; const CONTENT_FIELD_NAME = "content"; const KEY_FIELD_NAME = "id"; -const API_VERSION = "2025-09-01"; +const AI_AZURE_KNOWLEDGEBASE_API_VERSION = "2025-09-01"; const API_KEY_HEADER_NAME = "api-key"; +// Search action constants +const SEARCH_ACTION_MERGE_OR_UPLOAD = "mergeOrUpload"; +const SEARCH_ACTION_DELETE = 
"delete"; + +// Vector search constants +const VECTOR_QUERY_KIND = "vector"; + +// Content type constants +const CONTENT_TYPE_TEXT_CHUNK = "text-chunk"; +const MIME_TYPE_MARKDOWN = "text/markdown"; +const MIME_TYPE_HTML = "text/html"; + +// File extension constants +const FILE_EXT_MARKDOWN = ".md"; +const FILE_EXT_HTML = ".html"; + +// Search field constants +const SEARCH_SCORE_FIELD = "@search.score"; +const SEARCH_HIGHLIGHTS_FIELD = "@search.highlights"; +const SEARCH_ACTION_FIELD = "@search.action"; + +// OData operator constants +const ODATA_OPERATOR_GT = "gt"; +const ODATA_OPERATOR_LT = "lt"; +const ODATA_OPERATOR_GE = "ge"; +const ODATA_OPERATOR_LE = "le"; +const ODATA_OPERATOR_EQ = "eq"; +const ODATA_OPERATOR_NE = "ne"; +const ODATA_OPERATOR_AND = " and "; +const ODATA_OPERATOR_OR = " or "; + +// Preference header constants +const PREFER_HEADER_RETURN_REPRESENTATION = "return=representation"; + +// Default field names +const DEFAULT_TYPE_FIELD_NAME = "type"; + # Information about the analyzed index schema type IndexSchemaInfo record { # Name of the key field in the index @@ -38,7 +75,7 @@ type IndexSchemaInfo record { }; # Configuration for the Azure AI Service Clients -public type ClientConfiguration record {| +public type AzureAiSearchKnowledgeBaseClientConfiguration record {| # Connection configuration for the Azure AI search client that use for create search index # This configuration is only required when the `index` parameter # is provided as an `search:SearchIndex` (i.e., when the system will create the index). 
@@ -71,7 +108,8 @@ public distinct isolated class AzureAiSearchKnowledgeBase { # # + serviceUrl - The service URL of the Azure AI Search instance # + apiKey - The API key for authenticating with the Azure AI Search service - # + index - The name of an existing search index or a `search:SearchIndex` definition to create + # + index - The name of an existing search index or a `search:SearchIndex` definition to create, + # When creating a new index, ensure that it contains one key field of type string. # + embeddingModel - The embedding model to use for generating embeddings # + chunker - The chunker to use for chunking documents before ingestion. Defaults to `ai:AUTO`. # + verbose - Whether to enable verbose logging. Defaults to `false`. @@ -81,8 +119,8 @@ public distinct isolated class AzureAiSearchKnowledgeBase { # + return - An instance of `AzureAiSearchKnowledgeBase` or an `ai:Error` if initialization fails public isolated function init(string serviceUrl, string apiKey, string|search:SearchIndex index, ai:EmbeddingProvider embeddingModel, ai:Chunker|ai:AUTO|ai:DISABLE chunker = ai:AUTO, boolean verbose = false, - string apiVersion = API_VERSION, string contentFieldName = CONTENT_FIELD_NAME, - *ClientConfiguration clientConfigurations) returns ai:Error? { + string apiVersion = AI_AZURE_KNOWLEDGEBASE_API_VERSION, string contentFieldName = CONTENT_FIELD_NAME, + *AzureAiSearchKnowledgeBaseClientConfiguration clientConfigurations) returns ai:Error? 
{ self.chunker = chunker; self.embeddingModel = embeddingModel; self.verbose = verbose; @@ -115,7 +153,7 @@ public distinct isolated class AzureAiSearchKnowledgeBase { } else { logIfVerboseEnable(self.verbose, string `Attempting to create search index ${indexName}...`); search:SearchIndex|error createdIndex = self.serviceClient->indexesCreateOrUpdate(indexName, { - [API_KEY_HEADER_NAME]: self.apiKey, Prefer: "return=representation"}, index, {api\-version: self.apiVersion}); + [API_KEY_HEADER_NAME]: self.apiKey, Prefer: PREFER_HEADER_RETURN_REPRESENTATION}, index, {api\-version: self.apiVersion}); if createdIndex is error { logIfVerboseEnable(self.verbose, string `Failed to create search index ${indexName}: ${createdIndex.message()}`); return error ai:Error("Failed to create search index", createdIndex); @@ -198,7 +236,7 @@ public distinct isolated class AzureAiSearchKnowledgeBase { } lock { - ai:TextChunk queryChunk = {content: query, 'type: "text-chunk"}; + ai:TextChunk queryChunk = {content: query, 'type: CONTENT_TYPE_TEXT_CHUNK}; ai:Embedding queryEmbedding = check self.embeddingModel->embed(queryChunk); // Create vector search request using Azure AI Search's integrated vectorization @@ -206,14 +244,14 @@ public distinct isolated class AzureAiSearchKnowledgeBase { index:VectorQuery[]? vectorQuery = (); if vectorFieldLength != 0 { - ai:Vector|ai:Error vectors = self.generateVector(queryEmbedding); + ai:Vector|ai:Error vectors = generateVectorFromEmbedding(queryEmbedding); if vectors is ai:Error { return vectors; } vectorQuery = [ { - kind: "vector", + kind: VECTOR_QUERY_KIND, k: maxLimit == -1 ? 
() : maxLimit, fields: string:'join(",", ...self.vectorFieldNames), "vector": vectors @@ -252,9 +290,9 @@ public distinct isolated class AzureAiSearchKnowledgeBase { ai:QueryMatch[] matches = []; foreach index:SearchResult result in searchResult.value { ai:Chunk chunk = { - 'type: "text-chunk", - content: self.getFieldValue(result, self.contentFieldName), - metadata: self.extractMetadata(result) + 'type: CONTENT_TYPE_TEXT_CHUNK, + content: extractFieldValue(result, self.contentFieldName, self.verbose), + metadata: extractMetadataFromResult(result, self.contentFieldName, self.keyFieldName, self.vectorFieldNames) }; ai:QueryMatch queryMatch = { @@ -297,7 +335,7 @@ public distinct isolated class AzureAiSearchKnowledgeBase { // Extract document IDs string[] documentIds = []; foreach index:SearchResult result in searchResult.value { - string? documentId = self.getFieldValue(result, self.keyFieldName); + string? documentId = extractFieldValue(result, self.keyFieldName, self.verbose); if documentId is string { documentIds.push(documentId); } @@ -311,7 +349,7 @@ public distinct isolated class AzureAiSearchKnowledgeBase { index:IndexAction[] deleteActions = []; foreach string docId in documentIds { index:IndexAction deleteAction = { - \@search\.action: "delete" + \@search\.action: SEARCH_ACTION_DELETE }; // Set the key field for deletion deleteAction[self.keyFieldName] = docId; @@ -370,7 +408,7 @@ public distinct isolated class AzureAiSearchKnowledgeBase { } // Combine filters with the appropriate logical operator - string logicalOperator = node.condition == ai:AND ? " and " : " or "; + string logicalOperator = node.condition == ai:AND ? 
ODATA_OPERATOR_AND : ODATA_OPERATOR_OR; return string `(${string:'join(logicalOperator, ...filterExpressions)})`; } @@ -381,28 +419,28 @@ public distinct isolated class AzureAiSearchKnowledgeBase { match operator { ai:EQUAL => { - return self.buildEqualityFilter(fieldName, value); + return buildEqualityFilter(fieldName, value); } ai:NOT_EQUAL => { - return self.buildInequalityFilter(fieldName, value); + return buildInequalityFilter(fieldName, value); } ai:IN => { - return self.buildInFilter(fieldName, value); + return buildInFilter(fieldName, value); } ai:NOT_IN => { - return self.buildNotInFilter(fieldName, value); + return buildNotInFilter(fieldName, value); } ai:GREATER_THAN => { - return self.buildComparisonFilter(fieldName, value, "gt"); + return buildComparisonFilter(fieldName, value, ODATA_OPERATOR_GT); } ai:LESS_THAN => { - return self.buildComparisonFilter(fieldName, value, "lt"); + return buildComparisonFilter(fieldName, value, ODATA_OPERATOR_LT); } ai:GREATER_THAN_OR_EQUAL => { - return self.buildComparisonFilter(fieldName, value, "ge"); + return buildComparisonFilter(fieldName, value, ODATA_OPERATOR_GE); } ai:LESS_THAN_OR_EQUAL => { - return self.buildComparisonFilter(fieldName, value, "le"); + return buildComparisonFilter(fieldName, value, ODATA_OPERATOR_LE); } _ => { return (); // Unsupported operator @@ -410,109 +448,6 @@ public distinct isolated class AzureAiSearchKnowledgeBase { } } - private isolated function buildEqualityFilter(string fieldName, json value) returns string? { - string? formattedValue = self.formatValueForOData(value); - if formattedValue is string { - return string `${fieldName} eq ${formattedValue}`; - } - return (); - } - - private isolated function buildInequalityFilter(string fieldName, json value) returns string? { - string? 
formattedValue = self.formatValueForOData(value); - if formattedValue is string { - return string `${fieldName} ne ${formattedValue}`; - } - return (); - } - - private isolated function buildInFilter(string fieldName, json value) returns string? { - if value is json[] && value.length() > 0 { - string[] conditions = []; - foreach json item in value { - string? formattedValue = self.formatValueForOData(item); - if formattedValue is string { - conditions.push(string `${fieldName} eq ${formattedValue}`); - } - } - if conditions.length() > 0 { - return "(" + string:'join(" or ", ...conditions) + ")"; - } - } - return (); - } - - private isolated function buildNotInFilter(string fieldName, json value) returns string? { - if value is json[] && value.length() > 0 { - string[] conditions = []; - foreach json item in value { - string? formattedValue = self.formatValueForOData(item); - if formattedValue is string { - conditions.push(string `${fieldName} ne ${formattedValue}`); - } - } - if conditions.length() > 0 { - return "(" + string:'join(" and ", ...conditions) + ")"; - } - } - return (); - } - - private isolated function buildComparisonFilter(string fieldName, json value, string odataOperator) returns string? { - string? formattedValue = self.formatValueForOData(value); - if formattedValue is string { - return string `${fieldName} ${odataOperator} ${formattedValue}`; - } - return (); - } - - private isolated function formatValueForOData(json value) returns string? 
{ - if value is string { - // Escape single quotes in strings and wrap in single quotes - string escapedValue = re `'`.replaceAll(value, "''"); - return string `'${escapedValue}'`; - } else if value is int|decimal { - return value.toString(); - } else if value is boolean { - return value.toString(); - } - // For other types (like null), return null to indicate unsupported - return (); - } - - private isolated function getFieldValue(index:SearchResult result, string fieldName) returns string { - anydata fieldValue = result[fieldName]; - if fieldValue is string { - return fieldValue; - } - if fieldValue is () { - logIfVerboseEnable(self.verbose, string `Field ${fieldName} is null in search result.`); - return ""; - } - // Handle other types if they are possible content - return fieldValue.toString(); - } - - private isolated function extractMetadata(index:SearchResult result) returns ai:Metadata { - lock { - ai:Metadata metadata = {}; - - // Extract all fields except the core content/title fields as metadata - map clonedResult = result.cloneReadOnly(); - foreach string k in clonedResult.keys() { - anydata value = clonedResult[k]; - if k != self.contentFieldName && k != self.keyFieldName && self.vectorFieldNames.indexOf(k) == () && - k != "@search.score" && k != "@search.highlights" { - if value is json { - metadata[k] = value; - } - } - } - - return metadata.cloneReadOnly(); - } - } - private isolated function chunk(ai:Document|ai:Document[]|ai:Chunk[] input) returns ai:Chunk[]|ai:Error { (ai:Document|ai:Chunk)[] inputs = input is ai:Document[]|ai:Chunk[] ? input : [input]; ai:Chunker|ai:AUTO|ai:DISABLE chunker = self.chunker; @@ -533,7 +468,7 @@ public distinct isolated class AzureAiSearchKnowledgeBase { search:SearchIndex index, ai:Embedding[]? 
embeddings = (), index:DocumentsIndexHeaders headers = {}, - index:DocumentsIndexQueries queries = {api\-version: API_VERSION} + index:DocumentsIndexQueries queries = {api\-version: AI_AZURE_KNOWLEDGEBASE_API_VERSION} ) returns index:IndexDocumentsResult|error { if embeddings is ai:Embedding[] && embeddings.length() != documents.length() { return error ai:Error("Embeddings count does not match documents count, Embeddings length: " + @@ -546,69 +481,26 @@ public distinct isolated class AzureAiSearchKnowledgeBase { ai:Embedding[]? embeddingValues = embeddings.cloneReadOnly(); foreach int i in 0..documentsIndex(batch.cloneReadOnly(), headers.cloneReadOnly(), queries.cloneReadOnly()); } } - - private isolated function generateVector(ai:Embedding embedding) returns ai:Vector|ai:Error { - if embedding is ai:Vector { - return embedding; - } else if embedding is ai:HybridVector { - // Return the dense part, discard sparse - return embedding.dense; - } else { - // Explicitly fail for sparse-only embeddings - return error ai:Error("AzureAiSearchKnowledgeBase only supports dense or hybrid embeddings, but received a SparseVector."); - } - } } +# Logs informational or error messages if verbose mode is enabled +# +# + verbose - Whether verbose logging is enabled +# + value - The message to log +# + err - Optional error to log with additional details isolated function logIfVerboseEnable(boolean verbose, string value, 'error? err = ()) { if verbose { log:printInfo(string `[AzureAiSearchKnowledgeBase] ${value}`); @@ -640,28 +525,274 @@ isolated function logIfVerboseEnable(boolean verbose, string value, 'error? err } } +# Determines the appropriate chunker based on document metadata +# +# + doc - The document or chunk to determine chunker for +# + return - The appropriate chunker for the document type isolated function guessChunker(ai:Document|ai:Chunk doc) returns ai:Chunker { // Guess the chunker based on the document type or mimeType in metadata string? 
mimeType = doc.metadata?.mimeType; - if mimeType == "text/markdown" { + if mimeType == MIME_TYPE_MARKDOWN { return new ai:MarkdownChunker(); } - if mimeType == "text/html" { + if mimeType == MIME_TYPE_HTML { return new ai:HtmlChunker(); } // Fallback to file name string? fileName = doc.metadata?.fileName; if fileName is string { - if fileName.endsWith(".md") { + if fileName.endsWith(FILE_EXT_MARKDOWN) { return new ai:MarkdownChunker(); } - if fileName.endsWith(".html") { + if fileName.endsWith(FILE_EXT_HTML) { return new ai:HtmlChunker(); } } return new ai:GenericRecursiveChunker(); } +# Converts embeddings to vectors for Azure AI Search +# +# + embedding - The embedding to convert +# + return - The vector representation or an error if conversion fails +isolated function generateVectorFromEmbedding(ai:Embedding embedding) returns ai:Vector|ai:Error { + if embedding is ai:Vector { + return embedding; + } else if embedding is ai:HybridVector { + // Return the dense part, discard sparse + return embedding.dense; + } else { + // Explicitly fail for sparse-only embeddings + return error ai:Error("AzureAiSearchKnowledgeBase only supports dense or hybrid embeddings, but received a SparseVector."); + } +} + +# Formats a JSON value for use in OData expressions +# +# + value - The JSON value to format +# + return - The formatted string or null if type is unsupported +isolated function formatValueForOData(json value) returns string? 
{ + if value is string { + // Escape single quotes in strings and wrap in single quotes + string escapedValue = re `'`.replaceAll(value, "''"); + return string `'${escapedValue}'`; + } else if value is int|decimal { + return value.toString(); + } else if value is boolean { + return value.toString(); + } + // For other types (like null), return null to indicate unsupported + return (); +} + +# Builds an equality filter for OData +# +# + fieldName - The field name to filter on +# + value - The value to compare +# + return - The formatted equality filter or null if value is unsupported +isolated function buildEqualityFilter(string fieldName, json value) returns string? { + string? formattedValue = formatValueForOData(value); + if formattedValue is string { + return string `${fieldName} ${ODATA_OPERATOR_EQ} ${formattedValue}`; + } + return (); +} + +# Builds an inequality filter for OData +# +# + fieldName - The field name to filter on +# + value - The value to compare +# + return - The formatted inequality filter or null if value is unsupported +isolated function buildInequalityFilter(string fieldName, json value) returns string? { + string? formattedValue = formatValueForOData(value); + if formattedValue is string { + return string `${fieldName} ${ODATA_OPERATOR_NE} ${formattedValue}`; + } + return (); +} + +# Builds an IN filter for OData +# +# + fieldName - The field name to filter on +# + value - The array of values to check membership +# + return - The formatted IN filter or null if values are invalid +isolated function buildInFilter(string fieldName, json value) returns string? { + if value is json[] && value.length() > 0 { + string[] conditions = []; + foreach json item in value { + string? 
formattedValue = formatValueForOData(item); + if formattedValue is string { + conditions.push(string `${fieldName} ${ODATA_OPERATOR_EQ} ${formattedValue}`); + } + } + if conditions.length() > 0 { + return "(" + string:'join(ODATA_OPERATOR_OR, ...conditions) + ")"; + } + } + return (); +} + +# Builds a NOT IN filter for OData +# +# + fieldName - The field name to filter on +# + value - The array of values to exclude +# + return - The formatted NOT IN filter or null if values are invalid +isolated function buildNotInFilter(string fieldName, json value) returns string? { + if value is json[] && value.length() > 0 { + string[] conditions = []; + foreach json item in value { + string? formattedValue = formatValueForOData(item); + if formattedValue is string { + conditions.push(string `${fieldName} ${ODATA_OPERATOR_NE} ${formattedValue}`); + } + } + if conditions.length() > 0 { + return "(" + string:'join(ODATA_OPERATOR_AND, ...conditions) + ")"; + } + } + return (); +} + +# Builds a comparison filter for OData +# +# + fieldName - The field name to filter on +# + value - The value to compare +# + odataOperator - The OData comparison operator to use +# + return - The formatted comparison filter or null if value is unsupported +isolated function buildComparisonFilter(string fieldName, json value, string odataOperator) returns string? { + string? 
formattedValue = formatValueForOData(value); + if formattedValue is string { + return string `${fieldName} ${odataOperator} ${formattedValue}`; + } + return (); +} + +# Extracts a field value from a search result +# +# + result - The search result to extract from +# + fieldName - The name of the field to extract +# + verbose - Whether verbose logging is enabled +# + return - The field value as a string +isolated function extractFieldValue(index:SearchResult result, string fieldName, boolean verbose) returns string { + anydata fieldValue = result[fieldName]; + if fieldValue is string { + return fieldValue; + } + if fieldValue is () { + logIfVerboseEnable(verbose, string `Field ${fieldName} is null in search result.`); + return ""; + } + // Handle other types if they are possible content + return fieldValue.toString(); +} + +# Extracts metadata from a search result, excluding core fields +# +# + result - The search result to extract metadata from +# + contentFieldName - The name of the content field to exclude +# + keyFieldName - The name of the key field to exclude +# + vectorFieldNames - Array of vector field names to exclude +# + return - The extracted metadata +isolated function extractMetadataFromResult(index:SearchResult result, string contentFieldName, string keyFieldName, string[] vectorFieldNames) returns ai:Metadata { + ai:Metadata metadata = {}; + + // Extract all fields except the core content/title fields as metadata + map clonedResult = result.cloneReadOnly(); + foreach string k in clonedResult.keys() { + anydata value = clonedResult[k]; + if k != contentFieldName && k != keyFieldName && vectorFieldNames.indexOf(k) == () && + k != SEARCH_SCORE_FIELD && k != SEARCH_HIGHLIGHTS_FIELD { + if value is json { + metadata[k] = value; + } + } + } + + return metadata.cloneReadOnly(); +} + +# Creates an index action for a document or chunk +# +# + doc - The document or chunk to create action for +# + embedding - Optional embedding for vector fields +# + 
documentIndex - Index of the document in the batch +# + keyFieldName - Name of the key field +# + contentFieldName - Name of the content field +# + vectorFieldNames - Array of vector field names +# + allFields - Map of all fields in the index schema +# + verbose - Whether verbose logging is enabled +# + return - The created index action or an error +isolated function createIndexAction( + ai:Document|ai:Chunk doc, + ai:Embedding? embedding, + int documentIndex, + string keyFieldName, + string contentFieldName, + string[] vectorFieldNames, + map allFields, + boolean verbose +) returns index:IndexAction|ai:Error { + // Start with the basic action structure + index:IndexAction indexAction = { + \@search\.action: SEARCH_ACTION_MERGE_OR_UPLOAD + }; + + // Set the key field with a UUID + // TODO: handle non-string key fields + ai:Metadata? metadata = doc.metadata; + string keyValue = metadata !is () && metadata.hasKey(keyFieldName) + ? doc.metadata[keyFieldName].toString() + documentIndex.toString() + : uuid:createType1AsString(); + + indexAction[keyFieldName] = keyValue; + logIfVerboseEnable( + verbose, string `Set key field ${keyFieldName} to value ${keyValue} for document index ${documentIndex}.`); + + // Add embeddings to vector fields if available + if embedding is ai:Embedding { + foreach string vectorFieldName in vectorFieldNames { + ai:Vector|ai:Error vectors = generateVectorFromEmbedding(embedding); + if vectors is ai:Error { + logIfVerboseEnable( + verbose, string `Failed to generate vector for document index ${documentIndex} and field ${vectorFieldName}: ${vectors.message()}`); + return vectors; + } + + indexAction[vectorFieldName] = vectors; + logIfVerboseEnable( + verbose, string `Added vector for document index ${documentIndex} to field ${vectorFieldName}.`); + } + } + + indexAction[contentFieldName] = doc.content; + logIfVerboseEnable( + verbose, string `Added content for document index ${documentIndex} to field ${contentFieldName}.`); + + // Add document 
type if there's a field for it (check if "type" field exists) + if allFields.hasKey(DEFAULT_TYPE_FIELD_NAME) { + indexAction[DEFAULT_TYPE_FIELD_NAME] = doc.'type; + } + + // Add metadata fields + if metadata is ai:Metadata { + foreach [string, json] [key, value] in metadata.entries() { + boolean isPossibleMetadata = key != keyFieldName && key != contentFieldName + && vectorFieldNames.indexOf(key) == (); + // Only add metadata if the field exists in the index schema + if allFields.hasKey(key) && isPossibleMetadata { + indexAction[key] = value; + } else { + if isPossibleMetadata { + logIfVerboseEnable( + verbose, string `Skipping field ${key} as it does not exist in index schema.`); + } + } + } + } + + return indexAction; +} + isolated function analyzeIndexSchema(boolean verbose, search:SearchIndex index, string contentFieldName) returns IndexSchemaInfo|ai:Error { string? keyFieldName = (); string[] vectorFieldNames = []; From e91825e0ec574c73bb5da2d9ebaeb26cc70f454a Mon Sep 17 00:00:00 2001 From: Sasindu Alahakoon Date: Thu, 23 Oct 2025 13:04:09 +0530 Subject: [PATCH 05/10] Remove local dependencies --- ballerina/Ballerina.toml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/ballerina/Ballerina.toml b/ballerina/Ballerina.toml index 0bb7100..0fe26e5 100644 --- a/ballerina/Ballerina.toml +++ b/ballerina/Ballerina.toml @@ -17,15 +17,3 @@ groupId = "io.ballerina.lib" artifactId = "ai.azure-native" version = "1.2.0" path = "../native/build/libs/ai.azure-native-1.2.0-SNAPSHOT.jar" - -[[dependency]] -org="ballerinax" -name="azure.ai.search" -version="1.0.0" -repository="local" - -[[dependency]] -org="ballerinax" -name="azure.ai.search.index" -version="1.0.0" -repository="local" From c53ceee5ac8e626c2ce2f70db7b2c933601afd9b Mon Sep 17 00:00:00 2001 From: Sasindu Alahakoon Date: Thu, 23 Oct 2025 17:05:46 +0530 Subject: [PATCH 06/10] [Automated] Update the toml files --- ballerina/Ballerina.toml | 12 ++++++++++++ ballerina/Dependencies.toml | 3 +-- 2 files 
changed, 13 insertions(+), 2 deletions(-) diff --git a/ballerina/Ballerina.toml b/ballerina/Ballerina.toml index 0fe26e5..0bb7100 100644 --- a/ballerina/Ballerina.toml +++ b/ballerina/Ballerina.toml @@ -17,3 +17,15 @@ groupId = "io.ballerina.lib" artifactId = "ai.azure-native" version = "1.2.0" path = "../native/build/libs/ai.azure-native-1.2.0-SNAPSHOT.jar" + +[[dependency]] +org="ballerinax" +name="azure.ai.search" +version="1.0.0" +repository="local" + +[[dependency]] +org="ballerinax" +name="azure.ai.search.index" +version="1.0.0" +repository="local" diff --git a/ballerina/Dependencies.toml b/ballerina/Dependencies.toml index 77971fd..9d017b4 100644 --- a/ballerina/Dependencies.toml +++ b/ballerina/Dependencies.toml @@ -144,7 +144,7 @@ modules = [ [[package]] org = "ballerina" name = "io" -version = "1.8.0" +version = "1.8.1" dependencies = [ {org = "ballerina", name = "jballerina.java"}, {org = "ballerina", name = "lang.value"} @@ -435,7 +435,6 @@ dependencies = [ {org = "ballerina", name = "http"}, {org = "ballerina", name = "log"}, {org = "ballerina", name = "url"}, - {org = "ballerina", name = "uuid"}, {org = "ballerinai", name = "observe"} ] modules = [ From c44d2238c953ba524f565a0b3f9d2dd92a78c36d Mon Sep 17 00:00:00 2001 From: Sasindu Alahakoon Date: Thu, 23 Oct 2025 17:11:15 +0530 Subject: [PATCH 07/10] Update the names and config names --- ballerina/azure_ai_search_knowledgebase.bal | 16 ++++++++-------- build-config/resources/Ballerina.toml | 12 ++++++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/ballerina/azure_ai_search_knowledgebase.bal b/ballerina/azure_ai_search_knowledgebase.bal index fa84980..c986b3b 100644 --- a/ballerina/azure_ai_search_knowledgebase.bal +++ b/ballerina/azure_ai_search_knowledgebase.bal @@ -75,7 +75,7 @@ type IndexSchemaInfo record { }; # Configuration for the Azure AI Service Clients -public type AzureAiSearchKnowledgeBaseClientConfiguration record {| +public type 
AiSearchKnowledgeBaseClientConfiguration record {| # Connection configuration for the Azure AI search client that use for create search index # This configuration is only required when the `index` parameter # is provided as an `search:SearchIndex` (i.e., when the system will create the index). @@ -88,7 +88,7 @@ public type AzureAiSearchKnowledgeBaseClientConfiguration record {| # User should create the required `indexer`, `data source` and `index` beforehand using # the util functions provided in this module. # Currently search fields only supported with `id`, `content` and `type` field names. -public distinct isolated class AzureAiSearchKnowledgeBase { +public distinct isolated class AiSearchKnowledgeBase { *ai:KnowledgeBase; private final search:SearchIndex index; @@ -104,7 +104,7 @@ public distinct isolated class AzureAiSearchKnowledgeBase { private final string[] vectorFieldNames; private final map allFields; - # Initializes a new `AzureAiSearchKnowledgeBase` instance. + # Initializes a new `AiSearchKnowledgeBase` instance. # # + serviceUrl - The service URL of the Azure AI Search instance # + apiKey - The API key for authenticating with the Azure AI Search service @@ -116,11 +116,11 @@ public distinct isolated class AzureAiSearchKnowledgeBase { # + apiVersion - The API version to use for requests. # + clientConfigurations - Additional client configurations for Azure AI Search clients # + contentFieldName - The name of the field in the index that contains the main content. Defaults to "content". 
- # + return - An instance of `AzureAiSearchKnowledgeBase` or an `ai:Error` if initialization fails + # + return - An instance of `AiSearchKnowledgeBase` or an `ai:Error` if initialization fails public isolated function init(string serviceUrl, string apiKey, string|search:SearchIndex index, ai:EmbeddingProvider embeddingModel, ai:Chunker|ai:AUTO|ai:DISABLE chunker = ai:AUTO, boolean verbose = false, string apiVersion = AI_AZURE_KNOWLEDGEBASE_API_VERSION, string contentFieldName = CONTENT_FIELD_NAME, - *AzureAiSearchKnowledgeBaseClientConfiguration clientConfigurations) returns ai:Error? { + *AiSearchKnowledgeBaseClientConfiguration clientConfigurations) returns ai:Error? { self.chunker = chunker; self.embeddingModel = embeddingModel; self.verbose = verbose; @@ -518,9 +518,9 @@ public distinct isolated class AzureAiSearchKnowledgeBase { # + err - Optional error to log with additional details isolated function logIfVerboseEnable(boolean verbose, string value, 'error? err = ()) { if verbose { - log:printInfo(string `[AzureAiSearchKnowledgeBase] ${value}`); + log:printInfo(string `[AiSearchKnowledgeBase] ${value}`); if err is error { - log:printError(string `[AzureAiSearchKnowledgeBase] Error Details: ${err.message()}`, err); + log:printError(string `[AiSearchKnowledgeBase] Error Details: ${err.message()}`, err); } } } @@ -563,7 +563,7 @@ isolated function generateVectorFromEmbedding(ai:Embedding embedding) returns ai return embedding.dense; } else { // Explicitly fail for sparse-only embeddings - return error ai:Error("AzureAiSearchKnowledgeBase only supports dense or hybrid embeddings, but received a SparseVector."); + return error ai:Error("AiSearchKnowledgeBase only supports dense or hybrid embeddings, but received a SparseVector."); } } diff --git a/build-config/resources/Ballerina.toml b/build-config/resources/Ballerina.toml index 3d4f497..56a0388 100644 --- a/build-config/resources/Ballerina.toml +++ b/build-config/resources/Ballerina.toml @@ -17,3 +17,15 @@ 
groupId = "io.ballerina.lib" artifactId = "ai.azure-native" version = "@toml.version@" path = "../native/build/libs/ai.azure-native-@project.version@.jar" + +[[dependency]] +org="ballerinax" +name="azure.ai.search" +version="1.0.0" +repository="local" + +[[dependency]] +org="ballerinax" +name="azure.ai.search.index" +version="1.0.0" +repository="local" From 7044bed851ca0ba6b53aaee1d9a3cf29711f1168 Mon Sep 17 00:00:00 2001 From: Sasindu Alahakoon Date: Thu, 23 Oct 2025 17:18:57 +0530 Subject: [PATCH 08/10] Remove local dependencies --- build-config/resources/Ballerina.toml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/build-config/resources/Ballerina.toml b/build-config/resources/Ballerina.toml index 56a0388..3d4f497 100644 --- a/build-config/resources/Ballerina.toml +++ b/build-config/resources/Ballerina.toml @@ -17,15 +17,3 @@ groupId = "io.ballerina.lib" artifactId = "ai.azure-native" version = "@toml.version@" path = "../native/build/libs/ai.azure-native-@project.version@.jar" - -[[dependency]] -org="ballerinax" -name="azure.ai.search" -version="1.0.0" -repository="local" - -[[dependency]] -org="ballerinax" -name="azure.ai.search.index" -version="1.0.0" -repository="local" From 4e92e3cb478716891c9bd1109d91c73f0f23e7cd Mon Sep 17 00:00:00 2001 From: Sasindu Alahakoon Date: Thu, 23 Oct 2025 17:20:19 +0530 Subject: [PATCH 09/10] Remove local dependencies --- ballerina/Ballerina.toml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/ballerina/Ballerina.toml b/ballerina/Ballerina.toml index 0bb7100..0fe26e5 100644 --- a/ballerina/Ballerina.toml +++ b/ballerina/Ballerina.toml @@ -17,15 +17,3 @@ groupId = "io.ballerina.lib" artifactId = "ai.azure-native" version = "1.2.0" path = "../native/build/libs/ai.azure-native-1.2.0-SNAPSHOT.jar" - -[[dependency]] -org="ballerinax" -name="azure.ai.search" -version="1.0.0" -repository="local" - -[[dependency]] -org="ballerinax" -name="azure.ai.search.index" -version="1.0.0" -repository="local" From 
ccf8150780ac29e2e184ad6be2845a0163937b10 Mon Sep 17 00:00:00 2001 From: Sasindu Alahakoon Date: Thu, 23 Oct 2025 20:58:00 +0530 Subject: [PATCH 10/10] Update knowledgebase --- ballerina/azure_ai_search_knowledgebase.bal | 141 ++++++++++---------- 1 file changed, 72 insertions(+), 69 deletions(-) diff --git a/ballerina/azure_ai_search_knowledgebase.bal b/ballerina/azure_ai_search_knowledgebase.bal index c986b3b..29f2677 100644 --- a/ballerina/azure_ai_search_knowledgebase.bal +++ b/ballerina/azure_ai_search_knowledgebase.bal @@ -17,12 +17,12 @@ import ballerina/ai; import ballerina/log; import ballerina/uuid; -import ballerinax/azure.ai.search as search; +import ballerinax/azure.ai.search; import ballerinax/azure.ai.search.index; const CONTENT_FIELD_NAME = "content"; const KEY_FIELD_NAME = "id"; -const AI_AZURE_KNOWLEDGEBASE_API_VERSION = "2025-09-01"; +const AI_AZURE_KNOWLEDGE_BASE_API_VERSION = "2025-09-01"; const API_KEY_HEADER_NAME = "api-key"; // Search action constants @@ -74,20 +74,7 @@ type IndexSchemaInfo record { map allFields; }; -# Configuration for the Azure AI Service Clients -public type AiSearchKnowledgeBaseClientConfiguration record {| - # Connection configuration for the Azure AI search client that use for create search index - # This configuration is only required when the `index` parameter - # is provided as an `search:SearchIndex` (i.e., when the system will create the index). - search:ConnectionConfig searchClientConnectionConfig = {}; - # Connection configuration for the Azure AI index client that use for index operations - index:ConnectionConfig indexClientConnectionConfig = {}; -|}; - # Represents the Azure Search Knowledge Base implementation. -# User should create the required `indexer`, `data source` and `index` beforehand using -# the util functions provided in this module. -# Currently search fields only supported with `id`, `content` and `type` field names. 
public distinct isolated class AiSearchKnowledgeBase { *ai:KnowledgeBase; @@ -116,22 +103,27 @@ public distinct isolated class AiSearchKnowledgeBase { # + apiVersion - The API version to use for requests. # + clientConfigurations - Additional client configurations for Azure AI Search clients # + contentFieldName - The name of the field in the index that contains the main content. Defaults to "content". + # + searchClientConnectionConfig - Connection configuration for the Azure AI search client. + # This configuration is only required when the `index` parameter is + # provided as a `search:SearchIndex` + # + indexClientConnectionConfig - Connection configuration for the Azure AI index client. # + return - An instance of `AiSearchKnowledgeBase` or an `ai:Error` if initialization fails - public isolated function init(string serviceUrl, string apiKey, string|search:SearchIndex index, ai:EmbeddingProvider embeddingModel, + public isolated function init(string serviceUrl, string apiKey, + string|search:SearchIndex index, ai:EmbeddingProvider embeddingModel, ai:Chunker|ai:AUTO|ai:DISABLE chunker = ai:AUTO, boolean verbose = false, - string apiVersion = AI_AZURE_KNOWLEDGEBASE_API_VERSION, string contentFieldName = CONTENT_FIELD_NAME, - *AiSearchKnowledgeBaseClientConfiguration clientConfigurations) returns ai:Error? { + string apiVersion = AI_AZURE_KNOWLEDGE_BASE_API_VERSION, string contentFieldName = CONTENT_FIELD_NAME, + search:ConnectionConfig searchClientConnectionConfig = {}, + index:ConnectionConfig indexClientConnectionConfig = {}) returns ai:Error? 
{ self.chunker = chunker; self.embeddingModel = embeddingModel; self.verbose = verbose; self.contentFieldName = contentFieldName; // Initialize service client for management operations - search:ConnectionConfig searchClientConfig = clientConfigurations.searchClientConnectionConfig; self.apiKey = apiKey; self.apiVersion = apiVersion; - search:Client|error serviceClient = new search:Client(serviceUrl, searchClientConfig); + search:Client|error serviceClient = new search:Client(serviceUrl, searchClientConnectionConfig); if serviceClient is error { return error ai:Error("Failed to initialize Azure AI Service Client", serviceClient); } @@ -144,29 +136,34 @@ public distinct isolated class AiSearchKnowledgeBase { search:SearchIndex|error searchIndex = self.serviceClient->indexesGet(indexName, { [API_KEY_HEADER_NAME]: self.apiKey}, {api\-version: self.apiVersion}); if searchIndex is error { - logIfVerboseEnable(self.verbose, string `Search index ${indexName} does not exist: ${searchIndex.message()}`); + logIfVerboseEnabled(self.verbose, + string `Search index ${indexName} does not exist: ${searchIndex.message()}`); return error ai:Error("Failed to verify existence of index", searchIndex); } self.index = searchIndex.cloneReadOnly(); - logIfVerboseEnable(self.verbose, string `Search index ${indexName} exists. Details: ${searchIndex.toJsonString()}`); + logIfVerboseEnabled(self.verbose, + string `Search index ${indexName} exists. 
Details: ${searchIndex.toJsonString()}`); } else { - logIfVerboseEnable(self.verbose, string `Attempting to create search index ${indexName}...`); + logIfVerboseEnabled(self.verbose, string `Attempting to create search index ${indexName}...`); search:SearchIndex|error createdIndex = self.serviceClient->indexesCreateOrUpdate(indexName, { - [API_KEY_HEADER_NAME]: self.apiKey, Prefer: PREFER_HEADER_RETURN_REPRESENTATION}, index, {api\-version: self.apiVersion}); + [API_KEY_HEADER_NAME]: self.apiKey, Prefer: PREFER_HEADER_RETURN_REPRESENTATION}, + index, {api\-version: self.apiVersion}); if createdIndex is error { - logIfVerboseEnable(self.verbose, string `Failed to create search index ${indexName}: ${createdIndex.message()}`); + logIfVerboseEnabled(self.verbose, + string `Failed to create search index ${indexName}: ${createdIndex.message()}`); return error ai:Error("Failed to create search index", createdIndex); } self.index = createdIndex.cloneReadOnly(); - logIfVerboseEnable(self.verbose, string `Search index ${indexName} created successfully.`); + logIfVerboseEnabled(self.verbose, string `Search index ${indexName} created successfully.`); } string indexServiceUrl = string `${serviceUrl}/indexes('${indexName}')`; - logIfVerboseEnable(self.verbose, string `Initializing Azure Index Client for index URL: ${indexServiceUrl}`); - index:Client|error indexClient = new (indexServiceUrl, clientConfigurations.indexClientConnectionConfig); + logIfVerboseEnabled(self.verbose, string `Initializing Azure Index Client for index URL: ${indexServiceUrl}`); + index:Client|error indexClient = new (indexServiceUrl, indexClientConnectionConfig); if indexClient is error { - logIfVerboseEnable(self.verbose, string `Failed to initialize Azure Index Client: ${indexClient.message()}`); + logIfVerboseEnabled(self.verbose, + string `Failed to initialize Azure Index Client: ${indexClient.message()}`); return error ai:Error("Failed to initialize Azure Index Client", indexClient); } 
self.indexClient = indexClient; @@ -187,28 +184,33 @@ public distinct isolated class AiSearchKnowledgeBase { lock { ai:Chunk[]|ai:Error chunks = self.chunk(documents.clone()); if chunks is ai:Error { - logIfVerboseEnable(self.verbose, string `Failed to chunk documents: ${chunks.message()}}`, chunks); + logIfVerboseEnabled(self.verbose, + string `Failed to chunk documents: ${chunks.message()}}`, chunks); return error ai:Error("Failed to chunk documents before ingestion", chunks); } ai:Embedding[]|error embeddings = self.embeddingModel->batchEmbed(chunks); if embeddings is error { - logIfVerboseEnable(self.verbose, string `Failed to generate embeddings for documents: ${embeddings.message()}}`, embeddings); + logIfVerboseEnabled(self.verbose, + string `Failed to generate embeddings for documents: ${embeddings.message()}}`, embeddings); return error ai:Error("Failed to generate embeddings for documents", embeddings); } - logIfVerboseEnable(self.verbose, string `Generated embeddings for ${embeddings.length().toString()} chunks.`); + logIfVerboseEnabled(self.verbose, + string `Generated embeddings for ${embeddings.length().toString()} chunks.`); index:IndexDocumentsResult|error uploadResult = self.uploadDocuments(self.indexClient, chunks, self.index, embeddings, {[API_KEY_HEADER_NAME]: self.apiKey}, {api\-version: self.apiVersion}); if uploadResult is error { - logIfVerboseEnable(self.verbose, string `Failed to upload documents to search index: ${uploadResult.message()}}`, uploadResult); + logIfVerboseEnabled(self.verbose, + string `Failed to upload documents to search index: ${uploadResult.message()}}`, uploadResult); return error ai:Error("Failed to upload documents to search index", uploadResult); } // Validate that all documents were successfully indexed foreach index:IndexingResult result in uploadResult.value { if !result.status { - return error ai:Error(string `Failed to index document with key ${result.'key}: ${result.errorMessage ?: "Unknown error"}`); + return 
error ai:Error( + string `Failed to index document with key ${result.'key}: ${result.errorMessage ?: "Unknown error"}`); } } @@ -222,11 +224,8 @@ public distinct isolated class AiSearchKnowledgeBase { # + maxLimit - The maximum number of items to return # + filters - Optional metadata filters to apply during retrieval # + return - An array of matching chunks with similarity scores, or an `ai:Error` if retrieval fails - public isolated function retrieve(string query, int maxLimit = 10, ai:MetadataFilters? filters = ()) returns ai:QueryMatch[]|ai:Error { - if query is "" { - return error ai:Error("Query cannot be empty for retrieval"); - } - + public isolated function retrieve(string query, int maxLimit = 10, + ai:MetadataFilters? filters = ()) returns ai:QueryMatch[]|ai:Error { if maxLimit != -1 && maxLimit <= 0 { return error ai:Error("maxLimit must be a positive integer"); } @@ -282,7 +281,8 @@ public distinct isolated class AiSearchKnowledgeBase { ); if searchResult is error { - logIfVerboseEnable(self.verbose, string `Failed to retrieve documents from Azure AI Search: ${searchResult.message()}}`, searchResult); + logIfVerboseEnabled(self.verbose, + string `Failed to retrieve documents from Azure AI Search: ${searchResult.message()}}`, searchResult); return error ai:Error("Failed to retrieve documents from Azure AI Search", searchResult); } @@ -328,18 +328,15 @@ public distinct isolated class AiSearchKnowledgeBase { ); if searchResult is error { - logIfVerboseEnable(self.verbose, string `Failed to search for documents to delete: ${searchResult.message()}}`, searchResult); + logIfVerboseEnabled(self.verbose, + string `Failed to search for documents to delete: ${searchResult.message()}}`, searchResult); return error ai:Error("Failed to search for documents to delete", searchResult); } - // Extract document IDs - string[] documentIds = []; - foreach index:SearchResult result in searchResult.value { - string? 
documentId = extractFieldValue(result, self.keyFieldName, self.verbose); - if documentId is string { - documentIds.push(documentId); - } - } + string[] documentIds = from index:SearchResult result in searchResult.value + let string? documentId = extractFieldValue(result, self.keyFieldName, self.verbose) + where documentId is string + select documentId; if documentIds.length() == 0 { return; // No documents found matching the filters @@ -374,7 +371,8 @@ public distinct isolated class AiSearchKnowledgeBase { // Check for any failures in the delete operation foreach index:IndexingResult result in deleteResult.value { if !result.status { - return error ai:Error(string `Failed to delete document with key ${result.'key}: ${result.errorMessage ?: "Unknown error"}`); + return error ai:Error(string + `Failed to delete document with key ${result.'key}: ${result.errorMessage ?: "Unknown error"}`); } } @@ -468,7 +466,7 @@ public distinct isolated class AiSearchKnowledgeBase { search:SearchIndex index, ai:Embedding[]? 
embeddings = (), index:DocumentsIndexHeaders headers = {}, - index:DocumentsIndexQueries queries = {api\-version: AI_AZURE_KNOWLEDGEBASE_API_VERSION} + index:DocumentsIndexQueries queries = {api\-version: AI_AZURE_KNOWLEDGE_BASE_API_VERSION} ) returns index:IndexDocumentsResult|error { if embeddings is ai:Embedding[] && embeddings.length() != documents.length() { return error ai:Error("Embeddings count does not match documents count, Embeddings length: " + @@ -505,7 +503,8 @@ public distinct isolated class AiSearchKnowledgeBase { value: indexActions }; - logIfVerboseEnable(self.verbose, string `Uploading ${indexActions.length().toString()} documents to Azure AI Search index ${index.name}.`); + logIfVerboseEnabled(self.verbose, string + `Uploading ${indexActions.length().toString()} documents to Azure AI Search index ${index.name}.`); return 'client->documentsIndex(batch.cloneReadOnly(), headers.cloneReadOnly(), queries.cloneReadOnly()); } } @@ -516,7 +515,7 @@ public distinct isolated class AiSearchKnowledgeBase { # + verbose - Whether verbose logging is enabled # + value - The message to log # + err - Optional error to log with additional details -isolated function logIfVerboseEnable(boolean verbose, string value, 'error? err = ()) { +isolated function logIfVerboseEnabled(boolean verbose, string value, 'error? 
err = ()) { if verbose { log:printInfo(string `[AiSearchKnowledgeBase] ${value}`); if err is error { @@ -558,13 +557,13 @@ isolated function guessChunker(ai:Document|ai:Chunk doc) returns ai:Chunker { isolated function generateVectorFromEmbedding(ai:Embedding embedding) returns ai:Vector|ai:Error { if embedding is ai:Vector { return embedding; - } else if embedding is ai:HybridVector { + } + if embedding is ai:HybridVector { // Return the dense part, discard sparse return embedding.dense; - } else { - // Explicitly fail for sparse-only embeddings - return error ai:Error("AiSearchKnowledgeBase only supports dense or hybrid embeddings, but received a SparseVector."); } + // Explicitly fail for sparse-only embeddings + return error("AiSearchKnowledgeBase only supports dense or hybrid embeddings, but received a SparseVector."); } # Formats a JSON value for use in OData expressions @@ -679,7 +678,7 @@ isolated function extractFieldValue(index:SearchResult result, string fieldName, return fieldValue; } if fieldValue is () { - logIfVerboseEnable(verbose, string `Field ${fieldName} is null in search result.`); + logIfVerboseEnabled(verbose, string `Field ${fieldName} is null in search result.`); return ""; } // Handle other types if they are possible content @@ -693,7 +692,8 @@ isolated function extractFieldValue(index:SearchResult result, string fieldName, # + keyFieldName - The name of the key field to exclude # + vectorFieldNames - Array of vector field names to exclude # + return - The extracted metadata -isolated function extractMetadataFromResult(index:SearchResult result, string contentFieldName, string keyFieldName, string[] vectorFieldNames) returns ai:Metadata { +isolated function extractMetadataFromResult(index:SearchResult result, string contentFieldName, + string keyFieldName, string[] vectorFieldNames) returns ai:Metadata { ai:Metadata metadata = {}; // Extract all fields except the core content/title fields as metadata @@ -745,7 +745,7 @@ isolated function 
createIndexAction( : uuid:createType1AsString(); indexAction[keyFieldName] = keyValue; - logIfVerboseEnable( + logIfVerboseEnabled( verbose, string `Set key field ${keyFieldName} to value ${keyValue} for document index ${documentIndex}.`); // Add embeddings to vector fields if available @@ -753,19 +753,20 @@ isolated function createIndexAction( foreach string vectorFieldName in vectorFieldNames { ai:Vector|ai:Error vectors = generateVectorFromEmbedding(embedding); if vectors is ai:Error { - logIfVerboseEnable( - verbose, string `Failed to generate vector for document index ${documentIndex} and field ${vectorFieldName}: ${vectors.message()}`); + logIfVerboseEnabled( + verbose, string + `Failed to generate vector for document index ${documentIndex} and field ${vectorFieldName}: ${vectors.message()}`); return vectors; } indexAction[vectorFieldName] = vectors; - logIfVerboseEnable( + logIfVerboseEnabled( verbose, string `Added vector for document index ${documentIndex} to field ${vectorFieldName}.`); } } indexAction[contentFieldName] = doc.content; - logIfVerboseEnable( + logIfVerboseEnabled( verbose, string `Added content for document index ${documentIndex} to field ${contentFieldName}.`); // Add document type if there's a field for it (check if "type" field exists) @@ -783,7 +784,7 @@ isolated function createIndexAction( indexAction[key] = value; } else { if isPossibleMetadata { - logIfVerboseEnable( + logIfVerboseEnabled( verbose, string `Skipping field ${key} as it does not exist in index schema.`); } } @@ -793,7 +794,8 @@ isolated function createIndexAction( return indexAction; } -isolated function analyzeIndexSchema(boolean verbose, search:SearchIndex index, string contentFieldName) returns IndexSchemaInfo|ai:Error { +isolated function analyzeIndexSchema( + boolean verbose, search:SearchIndex index, string contentFieldName) returns IndexSchemaInfo|ai:Error { string? 
keyFieldName = (); string[] vectorFieldNames = []; string[] contentFieldNames = []; @@ -819,7 +821,7 @@ isolated function analyzeIndexSchema(boolean verbose, search:SearchIndex index, } if vectorFieldNames.length() == 0 { - logIfVerboseEnable(verbose, "No vector fields found in index schema."); + logIfVerboseEnabled(verbose, "No vector fields found in index schema."); } if contentFieldNames.length() == 0 { @@ -827,11 +829,12 @@ isolated function analyzeIndexSchema(boolean verbose, search:SearchIndex index, } if keyFieldName is () { - logIfVerboseEnable(verbose, string `No key field defined in index schema. Using default key field name as '${KEY_FIELD_NAME}'.`); + logIfVerboseEnabled(verbose, string `No key field defined in index schema. Using default key field name as '${KEY_FIELD_NAME}'.`); } if vectorFieldNames.length() > 1 { - logIfVerboseEnable(verbose, string `Multiple vector fields found in index schema: ${string:'join(", ", ...vectorFieldNames)}. Currently one vecotr field is prefered. So for now, there is more than one, all the vector fileds will share the same vectors.`); + logIfVerboseEnabled(verbose, string + `Multiple vector fields found in index schema: ${string:'join(", ", ...vectorFieldNames)}. Currently one vecotr field is prefered. So for now, there is more than one, all the vector fileds will share the same vectors.`); } return {