Merged

Changes from 1 commit
@@ -700,7 +700,8 @@ export enum TaskStatus {
export enum TaskTypes {
SERVICE_DESIGN = "service_design",
CONNECTIONS_INIT = "connections_init",
IMPLEMENTATION = "implementation"
IMPLEMENTATION = "implementation",
TESTING = "testing"
}

/**
@@ -21,6 +21,7 @@ import { TASK_WRITE_TOOL_NAME } from "./tools/task-writer";
import { FILE_BATCH_EDIT_TOOL_NAME, FILE_SINGLE_EDIT_TOOL_NAME, FILE_WRITE_TOOL_NAME } from "./tools/text-editor";
import { CONNECTOR_GENERATOR_TOOL } from "./tools/connector-generator";
import { CONFIG_COLLECTOR_TOOL } from "./tools/config-collector";
import { TEST_RUNNER_TOOL_NAME } from "./tools/test-runner";
import { getLanglibInstructions } from "../utils/libs/langlibs";
import { formatCodebaseStructure, formatCodeContext } from "./utils";
import { GenerateAgentCodeRequest, OperationType, ProjectSource } from "@wso2/ballerina-core";
@@ -72,11 +73,15 @@ This plan will be visible to the user and the execution will be guided on the ta
- This step should only contain the Client initialization.
3. 'implementation'
- for all other implementations. Keep resource function implementations in their own tasks.
4. 'testing'
- Responsible for writing test cases that cover the core logic of the implementation.
- Always include this task after all implementation tasks, unless the user has explicitly said they do not want tests.

#### Task Breakdown Example
1. Create the HTTP service contract
2. Create the MYSQL Connection
3. Implement the resource functions
4. Write test cases

**Critical Rules**:
- Task management is MANDATORY for all implementations
@@ -97,7 +102,8 @@ This plan will be visible to the user and the execution will be guided on the ta
- First use ${LIBRARY_SEARCH_TOOL} with relevant keywords to discover available libraries
- Then use ${LIBRARY_GET_TOOL} to fetch full details for the discovered libraries
- If NO suitable library is found, call ${CONNECTOR_GENERATOR_TOOL} to generate connector from OpenAPI spec
- Before marking the task as completed, use the ${DIAGNOSTICS_TOOL_NAME} tool to check for compilation errors and fix them. Introduce a a new subtask if needed to fix errors.
- Before marking the task as completed, use ${DIAGNOSTICS_TOOL_NAME} to check for compilation errors and fix them. Introduce a new subtask if needed.
- Once compilation is clean and the project contains test cases, run the tests.
Comment on lines +104 to +105
Contributor
⚠️ Potential issue | 🟡 Minor

Test execution may fire prematurely during non-testing tasks.

The trigger condition "Once compilation is clean and the project contains test cases, run the tests" fires after every task in plan mode — not just after the testing task. If the user's project already contains test files (iterative workflow, existing test suite, partial agent run), this will invoke the test runner after connections_init or implementation tasks as well, running a potentially stale/incomplete test suite mid-implementation.

The same condition repeats on line 141 in edit mode, where there is no testing task type to act as a natural gate.

Consider restricting this trigger to the testing task type (plan mode) or making it explicitly conditional on the current task type, e.g.:

💡 Suggested wording
-   - Before marking the task as completed, use ${DIAGNOSTICS_TOOL_NAME} to check for compilation errors and fix them. Introduce a new subtask if needed.
-   - Once compilation is clean and the project contains test cases, run the tests.
+   - Before marking the task as completed, use ${DIAGNOSTICS_TOOL_NAME} to check for compilation errors and fix them. Introduce a new subtask if needed.
+   - Once compilation is clean and the current task is a 'testing' task, run the tests.
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@workspaces/ballerina/ballerina-extension/src/features/ai/agent/prompts.ts`
around lines 105 - 106, The test-run trigger in the agent prompt currently fires
whenever compilation is clean and tests exist, which causes tests to run after
any task; update the prompt text around the DIAGNOSTICS_TOOL_NAME guidance to
require the current task be the testing task (e.g., add an explicit condition
like "only if the current task's type is 'testing'") before instructing to run
tests in plan mode, and make the same explicit conditional change in the
edit-mode prompt block where the duplicate wording appears (refer to the prompt
strings containing DIAGNOSTICS_TOOL_NAME and the testing-related trigger to
locate and update both spots).

- Mark task as completed using ${TASK_WRITE_TOOL_NAME} (send ALL tasks)
- The tool will wait for TASK COMPLETION APPROVAL from the user
- Once approved (success: true), immediately start the next task
@@ -110,6 +116,13 @@ This plan will be visible to the user and the execution will be guided on the ta
- Keep language simple and non-technical when responding
- No need to add manual progress indicators - the task list shows what you're working on

## Test Runner
When running tests, follow these steps:
1. Before running, briefly tell the user what is being tested.
2. Use ${TEST_RUNNER_TOOL_NAME} to run the test suite.
3. After the run, give a short summary: how many tests passed/failed.
4. If there are failures, mention which tests failed and why (one line each), fix them, and re-run.
Contributor

⚠️ Potential issue | 🟠 Major

Unbounded fix-and-rerun loop — add a retry cap.

Step 4 instructs the agent to "fix them, and re-run" with no iteration limit. For persistently failing tests (environment-dependent failures, flaky assertions, genuinely incorrect logic), the agent can loop indefinitely consuming tokens, time, and incurring LLM cost with no exit path.

Add an explicit maximum-attempt guard and an escape hatch:

💡 Suggested wording
 ## Test Runner
 When running tests, follow these steps:
 1. Before running, briefly tell the user what is being tested.
 2. Use ${TEST_RUNNER_TOOL_NAME} to run the test suite.
 3. After the run, give a short summary: how many tests passed/failed.
-4. If there are failures, mention which tests failed and why (one line each), fix them, and re-run.
+4. If there are failures, mention which tests failed and why (one line each), fix them, and re-run.
+   Repeat at most 3 times. If tests still fail after 3 attempts, report the remaining failures to the user and stop.
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@workspaces/ballerina/ballerina-extension/src/features/ai/agent/prompts.ts`
around lines 119 - 124, Update the "## Test Runner" prompt block (the Test
Runner section that uses ${TEST_RUNNER_TOOL_NAME}) to include an explicit retry
cap and an escape hatch: add a maximum attempts parameter (e.g., "max_attempts:
3") and change step 4 to say "if failures remain after max_attempts, stop
retrying, report remaining failing tests with one-line diagnostics, and
recommend manual investigation or filing an issue"; ensure the prompt instructs
the agent to increment an attempt counter each run and to stop once max_attempts
is reached while summarizing unresolved failures and next steps.
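The capped loop the reviewer asks for can be sketched in TypeScript. This is a minimal illustration, not the extension's code: `runTests` and `fixFailures` are hypothetical stand-ins for the agent's real tool calls, and the 3-attempt cap mirrors the suggested wording.

```typescript
// Hypothetical sketch of a bounded fix-and-rerun loop. The callbacks stand in
// for the agent's real test-runner and fix steps; only the cap logic matters here.
interface TestRun {
    failures: string[];
}

async function runWithRetryCap(
    runTests: () => Promise<TestRun>,
    fixFailures: (failures: string[]) => Promise<void>,
    maxAttempts = 3
): Promise<TestRun> {
    let result = await runTests();
    // Each iteration is one fix-and-rerun attempt; runTests is invoked at most
    // maxAttempts times in total.
    for (let attempt = 1; attempt < maxAttempts && result.failures.length > 0; attempt++) {
        await fixFailures(result.failures);
        result = await runTests();
    }
    // After maxAttempts, remaining failures are reported to the user, not retried.
    return result;
}
```

The key property is the hard exit path: even if every fix attempt fails, the loop terminates after `maxAttempts` runs and the caller can surface the unresolved failures.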


## Edit Mode
In the <system-reminder> tags, you will see if Edit mode is enabled. When it's enabled, you must follow the instructions below strictly.

@@ -123,9 +136,9 @@ Identify the libraries required to implement the user requirement. Use ${LIBRARY
Write/modify the Ballerina code to implement the user requirement. Use the ${FILE_BATCH_EDIT_TOOL_NAME}, ${FILE_SINGLE_EDIT_TOOL_NAME}, ${FILE_WRITE_TOOL_NAME} tools to write/modify the code.

### Step 4: Validate the code
Once the task is done, Always use ${DIAGNOSTICS_TOOL_NAME} tool to check for compilation errors and fix them.
You can use this tool multiple times after making changes to ensure there are no compilation errors.
If you think you can't fix the error after multiple attempts, make sure to keep bring the code into a good state and finish off the task.
Once the code is written, always use ${DIAGNOSTICS_TOOL_NAME} to check for compilation errors and fix them. You may call it multiple times after making changes.
If errors cannot be resolved after multiple attempts, bring the code to a good state and finish the task.
Once compilation is clean and the project contains test cases, run the tests.

### Step 5: Provide a concise summary
Once the code is written and validated, provide a very concise summary of the overall changes made. Avoid adding detailed explanations and NEVER create documentations files via ${FILE_WRITE_TOOL_NAME}.
@@ -144,7 +157,7 @@ When generating Ballerina code strictly follow these syntax and structure guidel
- In the library API documentation, if the service type is specified as generic, adhere to the instructions specified there on writing the service.
- For GraphQL service related queries, if the user hasn't specified their own GraphQL Schema, write the proposed GraphQL schema for the user query right after the explanation before generating the Ballerina code. Use the same names as the GraphQL Schema when defining record types.
- Some libraries have an instructions field in their API documentation. Follow those instructions strictly when using those libraries.
- You should only generate tests if the user explicitly asks for them in the query. You must use the 'ballerina/test' and whatever services associated when writing tests. Respect the instructions field in ballerina/test library and testGenerationInstruction field in whatever library associated with the service in the library API documentation when writing tests.
- When writing tests, use the 'ballerina/test' module and any service-specific test libraries. Respect the instructions field in ballerina/test library and the testGenerationInstruction field in the associated service library API documentation when writing tests.

${getLanglibInstructions()}

@@ -41,6 +41,7 @@ import { getHealthcareLibraryProviderTool, HEALTHCARE_LIBRARY_PROVIDER_TOOL } fr
import { createConnectorGeneratorTool, CONNECTOR_GENERATOR_TOOL } from './tools/connector-generator';
import { LIBRARY_SEARCH_TOOL, getLibrarySearchTool } from './tools/library-search';
import { createConfigCollectorTool, CONFIG_COLLECTOR_TOOL } from './tools/config-collector';
import { createTestRunnerTool, TEST_RUNNER_TOOL_NAME } from './tools/test-runner';

export interface ToolRegistryOptions {
eventHandler: CopilotEventHandler;
@@ -101,5 +102,6 @@ export function createToolRegistry(opts: ToolRegistryOptions) {
createReadExecute(eventHandler, tempProjectPath)
),
[DIAGNOSTICS_TOOL_NAME]: createDiagnosticsTool(tempProjectPath, eventHandler),
[TEST_RUNNER_TOOL_NAME]: createTestRunnerTool(tempProjectPath, eventHandler),
};
}
@@ -36,7 +36,7 @@ export interface TaskWriteResult {
export const TaskInputSchema = z.object({
description: z.string().min(1).describe("Clear, actionable description of the task to be implemented"),
status: z.enum([TaskStatus.PENDING, TaskStatus.IN_PROGRESS, TaskStatus.COMPLETED]).describe("Current status of the task. Use 'pending' for tasks not started, 'in_progress' when actively working on it, 'completed' when work is finished."),
type: z.enum([TaskTypes.SERVICE_DESIGN, TaskTypes.CONNECTIONS_INIT, TaskTypes.IMPLEMENTATION]).describe("Type of the implementation task. service_design will only generate the http service contract. not the implementation. connections_init will only generate the connection initializations. All of the other tasks will be of type implementation.")
type: z.enum([TaskTypes.SERVICE_DESIGN, TaskTypes.CONNECTIONS_INIT, TaskTypes.IMPLEMENTATION, TaskTypes.TESTING]).describe("Type of the implementation task. service_design: creates the HTTP service contract only (no implementation). connections_init: creates connection/client initializations only. implementation: all other implementation tasks. testing: writing test cases for the implemented logic — always the last task unless user opts out.")
});

const TaskWriteInputSchema = z.object({
@@ -92,7 +92,8 @@ Example:
[
{"description": "Create the HTTP service contract", "status": "pending", "type": "service_design"},
{"description": "Create the MYSQL Connection", "status": "pending", "type": "connections_init"},
{"description": "Implement the resource functions", "status": "pending", "type": "implementation"}
{"description": "Implement the resource functions", "status": "pending", "type": "implementation"},
{"description": "Write test cases", "status": "pending", "type": "testing"}
]

## UPDATING TASKS (Every Other Call):
@@ -0,0 +1,101 @@
// Copyright (c) 2026, WSO2 LLC. (https://www.wso2.com/) All Rights Reserved.

// WSO2 LLC. licenses this file to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file except
// in compliance with the License.
// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { tool } from 'ai';
import { z } from 'zod';
import child_process from 'child_process';
import { CopilotEventHandler } from '../../utils/events';
import { extension } from '../../../../BalExtensionContext';
import { DIAGNOSTICS_TOOL_NAME } from './diagnostics';

export const TEST_RUNNER_TOOL_NAME = "runTests";

export interface TestRunResult {
output: string;
}

const TestRunnerInputSchema = z.object({});

/**
* Creates the test runner tool for the AI agent.
*
* Executes `bal test` in the temp project directory and returns the full output
* so the agent can diagnose failures and fix them before completing a task.
*
* @param tempProjectPath - Path to the temporary project directory (agent's working dir)
* @param eventHandler - Event handler to emit tool execution events to the visualizer
* @returns Tool instance for running the Ballerina test suite
*/
export function createTestRunnerTool(
tempProjectPath: string,
eventHandler: CopilotEventHandler
) {
return tool({
description: `Runs \`bal test\` in the current Ballerina project and returns the raw output.

**Prerequisites:** The project must compile cleanly. Always run \`${DIAGNOSTICS_TOOL_NAME}\` first and resolve all compilation errors before invoking this tool — tests cannot run on code that does not compile.

**REQUIRED before calling this tool:** You MUST tell the user what is being tested (e.g. which functions or scenarios the test cases cover). Do NOT invoke this tool without first informing the user.

**When to use:**
- After compilation is clean and the project contains test cases
- After modifying existing code, to confirm tests still pass
- After writing new test cases, to validate them

**Output:** Returns the full raw \`bal test\` output. Read the output carefully to identify which tests passed or failed, then fix any failures before marking the task as complete.
`,
inputSchema: TestRunnerInputSchema,
execute: async (_input: Record<string, never>, context?: { toolCallId?: string }): Promise<TestRunResult> => {
const toolCallId = context?.toolCallId || `fallback-${Date.now()}`;

eventHandler({
type: "tool_call",
toolName: TEST_RUNNER_TOOL_NAME,
toolCallId,
});

const result = await runBallerinaTests(tempProjectPath);

eventHandler({
type: "tool_result",
toolName: TEST_RUNNER_TOOL_NAME,
toolCallId,
toolOutput: result
});

return result;
}
});
}

/**
* Executes `bal test` in the given directory and parses the output.
*/
async function runBallerinaTests(cwd: string): Promise<TestRunResult> {
return new Promise((resolve) => {
const balCmd = extension.ballerinaExtInstance.getBallerinaCmd();
const command = `${balCmd} test`;

console.log(`[TestRunner] Running: ${command} in ${cwd}`);

child_process.exec(command, { cwd }, (err, stdout, stderr) => {
Contributor

We need to see if we can utilize inbuilt VS Code tools, but we can do that in a separate PR.

Member Author
Sure

const output = [stdout, stderr].filter(Boolean).join('\n').trim();

console.log(`[TestRunner] Completed. Exit code: ${err?.code ?? 0}`);
resolve({ output });
});
});
Comment on lines +99 to +112
Contributor
⚠️ Potential issue | 🟠 Major

Missing timeout and maxBuffer on child_process.exec — risk of infinite hang and silent output truncation.

bal test can block indefinitely if a test awaits a network resource, has an infinite loop, or requires interactive input. Without a timeout option, the Promise never settles and the agent loop is frozen with no recovery path until the user stops the entire generation. Additionally, the default maxBuffer (1 MB) can be silently exceeded for verbose test suites, and any OS-level exec error (err.message, e.g. "stdout maxBuffer exceeded", command-not-found) is never surfaced to the agent because only stdout/stderr are joined into output.

🐛 Proposed fix — add timeout, explicit maxBuffer, and exec-error surfacing
 async function runBallerinaTests(cwd: string): Promise<TestRunResult> {
     return new Promise((resolve) => {
         const balCmd = extension.ballerinaExtInstance.getBallerinaCmd();
         const command = `${balCmd} test`;
+        const TIMEOUT_MS = 5 * 60 * 1000;  // 5 minutes
+        const MAX_BUFFER  = 10 * 1024 * 1024; // 10 MB

         console.log(`[TestRunner] Running: ${command} in ${cwd}`);

-        child_process.exec(command, { cwd }, (err, stdout, stderr) => {
-            const output = [stdout, stderr].filter(Boolean).join('\n').trim();
-
-            console.log(`[TestRunner] Completed. Exit code: ${err?.code ?? 0}`);
+        child_process.exec(command, { cwd, timeout: TIMEOUT_MS, maxBuffer: MAX_BUFFER }, (err, stdout, stderr) => {
+            const parts = [stdout, stderr].filter(Boolean);
+            if (err) {
+                if (err.killed) {
+                    parts.push(`\nError: 'bal test' timed out after ${TIMEOUT_MS / 1000} seconds.`);
+                } else if (!stdout && !stderr) {
+                    // OS-level failure (e.g. command not found, maxBuffer exceeded)
+                    parts.push(`\nError: ${err.message}`);
+                }
+            }
+            const output = parts.join('\n').trim();
+            console.log(`[TestRunner] Completed. Exit code: ${err?.code ?? 0}, killed: ${err?.killed ?? false}`);
             resolve({ output });
         });
     });
 }
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@workspaces/ballerina/ballerina-extension/src/features/ai/agent/tools/test-runner.ts`
around lines 87 - 100, In runBallerinaTests, child_process.exec is called
without timeout/maxBuffer and ignores exec errors; update the exec call in
runBallerinaTests to pass an options object including a reasonable timeout
(e.g., milliseconds) and an increased maxBuffer, and change the Promise
resolution to always settle with either resolved output or a rejected/errored
result that includes exec error details (err.message/err.code) so command, cwd
and err are surfaced rather than silently hanging or truncating; ensure the
output returned still concatenates stdout/stderr but also appends or includes
err.message and err.code when err is present.


}
@@ -422,6 +422,11 @@ const AIChat: React.FC = () => {
}
return newMessages;
});
} else if (response.toolName === "runTests") {
const toolCallId = response?.toolCallId;
updateLastMessage((content) =>
content + `\n\n<toolcall id="${toolCallId}" tool="${response.toolName}">Running tests...</toolcall>`
);
}
} else if (type === "tool_result") {
if (response.toolName === "LibrarySearchTool") {
@@ -581,6 +586,13 @@
}
return newMessages;
});
} else if (response.toolName === "runTests") {
const toolCallId = response.toolCallId;
if (toolCallId) {
const searchPattern = `<toolcall id="${toolCallId}" tool="${response.toolName}">Running tests...</toolcall>`;
const replacement = `<toolresult id="${toolCallId}" tool="${response.toolName}">Tests completed</toolresult>`;
updateLastMessage((content) => content.replace(searchPattern, replacement));
}
}
} else if (type === "task_approval_request") {
if (response.approvalType === "plan") {
Expand Down
@@ -140,6 +140,7 @@ function getGroupCategory(toolNames: (string | undefined)[]): ToolCategory {
const hasPlanning = names.includes("task_write") || names.includes("TaskWrite");
const hasConfig = names.includes("ConfigCollector");
const hasConnector = names.includes("ConnectorGeneratorTool");
const hasTestRunner = names.includes("runTests");

if (hasFile && !hasLibrary && !hasDiagnostics) {
return { running: "Editing code...", done: "Code updated" };
@@ -159,6 +160,9 @@ function getGroupCategory(toolNames: (string | undefined)[]): ToolCategory {
if (hasConnector) {
return { running: "Generating connector...", done: "Connector ready" };
}
if (hasTestRunner) {
return { running: "Running tests...", done: "Tests completed" };
Contributor
shouldn't we display test pass/failure status here?

Member Author

For the initial implementation it's handled by the agent. It'll give a summary of the tests and what was tested.
We can improve it further from the tool side in the next PR.

}
return { running: "Thinking...", done: "Done" };
}
