UN-3096 add 1st e2e test case #179

Changes from 19 commits
**CI workflow** — `@@ -35,8 +35,15 @@ jobs:`

```yaml
      - name: Run flake8
        run: flake8

      - name: Run pytest (excluding integration tests)
        run: pytest --verbose -m "not integration" --timer-top-n 10
      - name: Run E2E Tests (only tests/e2e/)
        run: pytest tests/e2e/tests/ --verbose --connection direct --thinking-file --repeat 1 -n auto
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          AGENT_TOOL_PATH: "./neuro_san/coded_tools"
          PYTHONPATH: ${{ env.PYTHONPATH }}:"."

      - name: Run pytest Run All Other Tests (excluding integration and e2e)
        run: pytest --verbose -m "not integration and not e2e" --ignore=tests/e2e/ --timer-top-n 10
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          AGENT_TOOL_PATH: "./neuro_san/coded_tools"
```

> **Contributor** (on the "Run pytest Run All Other Tests" step name): This name seems redundant. How about just
**Build/test requirements** — `@@ -8,6 +8,10 @@ timeout-decorator==0.5.0`

```text
coverage==7.6.1
pytest-cov==5.0.0
parameterized
pexpect
pyhocon
pytest-xdist
pytest-timeout

# Code quality
flake8==7.1.1
```

> **Author:** Added requirement for e2e tests.
>
> **Contributor:** Should these requirements go to
**New file: E2E README** — `@@ -0,0 +1,114 @@`

# 🧪 End-to-End Agent Testing Framework

This project provides an extensible, reusable **pytest**-based test system to validate AI agent behavior through real CLI interactions.

It supports:
- Running **multiple connections** (`grpc`, `http`, `direct`)
- **Parallel execution** with **pytest-xdist**
- Optional **thinking file capture** for agent internals
- Config-driven prompts using **HOCON** files

---

## 📦 Project Structure

```bash
e2e/
├── README.md                # This documentation
├── configs/                 # Static agent configuration
│   └── config.hocon
├── conftest.py              # Pytest customizations (CLI args, test discovery)
├── pytest.ini               # Pytest settings
├── requirements.txt         # Python dependencies
├── test_cases_data/         # Test data for each agent
│   └── mnpt_data.hocon
├── tests/                   # Test case source files
│   └── test_music_nerd_pro.py
└── utils/                   # Helper modules (parsing, building commands, etc.)
    ├── mnpt_hocon_loader.py
    ├── mnpt_output_parser.py
    ├── mnpt_test_runner.py
    ├── thinking_file_builder.py
    └── verifier.py
```

---
## 🚀 Running Tests

### Install Dependencies

```bash
pip install -r requirements.txt
```

### Basic Test Command

Run a test (default: **all connections**):

```bash
pytest tests/ --verbose
```

Run for a specific connection only:

```bash
pytest tests/ --connection grpc --verbose
```

Run and enable thinking-file output:

```bash
pytest tests/ --thinking-file --verbose
```

Enable parallel test execution:

```bash
pytest tests/ --connection grpc --repeat 5 --thinking-file -n auto --verbose
```

> 💡 When using `-n auto`, the repeated runs are distributed across multiple CPU cores.

---

## ⚙️ CLI Options

| Option | Description |
|:------------------|:------------|
| `--connection` | Run tests only for a specific connection (e.g., `grpc`, `http`, `direct`). |
| `--repeat` | Repeat each test multiple times. |
| `--thinking-file` | Save the agent's internal "thinking" to a temp directory during the test. |

---
# 🎧 Agent: MusicNerdPro Test (`test_music_nerd_pro.py`)

This suite tests the `music_nerd_pro` agent over all connection types.

### Test Logic

- Load prompts and expected outputs from **HOCON** config files
- Spawn a CLI agent process
- Send user questions
- Verify that:
  - The correct keyword appears in the response
  - The correct cost value is returned
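The per-question verification step above can be sketched as a small helper. The function name `verify_exchange` and the way `ask` is injected are illustrative assumptions; the real logic lives in `utils/` (e.g. `verifier.py`), and the `word`/`cost` field names mirror the test data file.

```python
# A minimal sketch of the per-question verification loop described above.
# `ask` stands in for the pexpect-driven send/expect pair; the helper itself
# is illustrative -- not the project's actual verifier implementation.
from typing import Callable, Dict


def verify_exchange(ask: Callable[[str], str], user_text: str,
                    expected: Dict[str, str]) -> str:
    """Send one prompt to the agent CLI and check keyword + cost in the reply."""
    response = ask(user_text)
    assert expected["word"] in response, f"keyword {expected['word']!r} missing"
    assert expected["cost"] in response, f"cost {expected['cost']!r} missing"
    return response


# With pexpect, `ask` would wrap child.sendline(...) / child.expect(...);
# here a canned reply stands in for the real agent.
fake_agent = lambda text: "The Beatles wrote it. Running cost: 3.0"
verify_exchange(fake_agent, "Who did yellow submarine?",
                {"word": "Beatles", "cost": "3.0"})
```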
### Related Files

| File | Purpose |
|:-----|:--------|
| `tests/test_music_nerd_pro.py` | Main test case (pytest function) |
| `test_cases_data/mnpt_data.hocon` | Prompt/expected-answer definitions |
| `configs/config.hocon` | Static agent config (connections list) |
| `utils/*.py` | Reusable helpers for all agent tests |

---
# 📝 Notes

- **Thinking files** are stored under `/private/tmp/agent_thinking/`.
- If `-n auto` is used, **worker-specific** folders are created (e.g., `run_gw0_1`).
- **pexpect** is used to fully simulate CLI typing behavior.
- Future agents can easily be added following the same pattern as MusicNerdPro.
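The worker-specific folder naming can be derived from the `PYTEST_XDIST_WORKER` environment variable that pytest-xdist sets inside each worker process. A hedged sketch — the exact scheme used by `thinking_file_builder.py` may differ:

```python
# Sketch of worker-specific thinking-file folders; pytest-xdist exports
# PYTEST_XDIST_WORKER (e.g. "gw0") in each worker process, and we fall back
# to "main" when running without -n.
import os


def thinking_dir(base: str, repeat_index: int) -> str:
    """Build a per-worker, per-repeat folder name such as run_gw0_1."""
    worker = os.environ.get("PYTEST_XDIST_WORKER", "main")
    return os.path.join(base, f"run_{worker}_{repeat_index + 1}")
```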
**`configs/config.hocon`** — `@@ -0,0 +1,8 @@`

```hocon
# config.hocon
# Agent config & connection setup

connection = ["direct", "grpc", "http"]
agent = [music_nerd_pro]

model_llm = ["gpt-4o", "llama3.1"]
```

> **Collaborator:** LLMs should be a property of the agent, not the test.
>
> **Author:** I listed it here because I was thinking of performance test case(s). For example:
>
> **Author:** Alternatively, we could utilize the existing infrastructure of the sly_data feature to perform the comparison.
**`conftest.py`** — `@@ -0,0 +1,108 @@`

```python
# conftest.py
# ------------------------------------------------------------------------
# Provides custom CLI flags, dynamic test generation, and environment setup.
# Shared pytest configuration for agent tests such as the MusicNerdPro test.
# ------------------------------------------------------------------------

import os

import pytest
from pyhocon import ConfigFactory

# ------------------------------------------------------------------------------
# Constants
# ------------------------------------------------------------------------------

# Directory where agent CLI thinking files will be written (optional feature)
THINKING_FILE_PATH = "/private/tmp/agent_thinking"

# Static agent config (HOCON) loaded once for all tests.
# Parse the config hocon to get the connections.
CONFIG_HOCON_PATH = os.path.join(os.path.dirname(__file__), "configs", "config.hocon")
config = ConfigFactory.parse_file(CONFIG_HOCON_PATH)

# ------------------------------------------------------------------------------
# Hooks
# ------------------------------------------------------------------------------


def pytest_configure(config):
    """
    Prints custom environment info when pytest starts.
    Helps verify environment settings.
    """
    print("\nCustom Environment Info")
    print(f"thinking-file path : {THINKING_FILE_PATH}")


def pytest_addoption(parser):
    """
    Adds custom command-line options for pytest to control the test suite:
      --connection    -> Filter tests by a specific connection method (direct/grpc/http)
      --repeat        -> Repeat the same test multiple times (for stability/reliability)
      --thinking-file -> Enable writing agent thinking_file logs during the test
    """
    group = parser.getgroup("custom options")
    group.addoption(
        "--connection",
        action="store",
        default=None,
        help="Specify a connection name to test (e.g., direct, grpc, http). If omitted, all will be tested."
    )
    group.addoption(
        "--repeat",
        action="store",
        type=int,
        default=1,
        help="Number of times to repeat each test (for stress or reliability testing)."
    )
    group.addoption(
        "--thinking-file",
        action="store_true",
        default=False,
        help="If enabled, the agent will write a thinking_file log per test case (grpc/http/direct)."
    )


def pytest_generate_tests(metafunc):
    """
    Dynamically parameterizes the tests based on the connection(s) and repetition requested.

    Example:
        --connection grpc --repeat 3
            -> Runs 3 tests against the 'grpc' connection.

        --repeat 2 (with no connection)
            -> Runs 2 tests for each connection (direct, grpc, http).

    This auto-expands into (connection_name, repeat_index) fixture pairs.
    """
    if "connection_name" in metafunc.fixturenames:
        # By default, all three connections are tested
        all_connections = load_connections()

        selected_connection = metafunc.config.getoption("connection")
        repeat = metafunc.config.getoption("repeat")

        # Filter if a specific connection is selected
        if selected_connection:
            if selected_connection not in all_connections:
                raise ValueError(f"Connection '{selected_connection}' not found in config: {all_connections}")
            all_connections = [selected_connection]

        # Generate the matrix of (connection_name, repeat_index) combinations
        test_params = [
            pytest.param(conn, i, id=f"{conn}_run{i+1}")
            for conn in all_connections
            for i in range(repeat)
        ]

        # Parametrize the test function
        metafunc.parametrize("connection_name, repeat_index", test_params)


# ------------------------------------------------------------------------------
# Utilities
# ------------------------------------------------------------------------------


def load_connections():
    """
    Loads the list of supported connection names from the HOCON config file.
    """
    return config.get("connection")
```
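In plain Python, the parametrization matrix that `pytest_generate_tests` expands looks like this (the names mirror the hook above; pytest internals are left out for illustration):

```python
# The (connection, repeat) matrix that pytest_generate_tests builds,
# reproduced without pytest internals.
connections = ["direct", "grpc", "http"]
repeat = 2

test_params = [(conn, i, f"{conn}_run{i + 1}")
               for conn in connections
               for i in range(repeat)]

# First entries: ("direct", 0, "direct_run1"), ("direct", 1, "direct_run2"), ...
```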
**`pytest.ini`** — `@@ -0,0 +1,5 @@`

```ini
# pytest.ini
[pytest]
filterwarnings =
    ignore:.*use of forkpty.*:DeprecationWarning:pty
```
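One possible follow-up (an assumption, not part of this diff): since the test module applies `@pytest.mark.e2e` and CI deselects with `-m "not e2e"`, the marker could also be registered here to avoid `PytestUnknownMarkWarning`:

```ini
[pytest]
markers =
    e2e: end-to-end tests that drive the agent CLI
filterwarnings =
    ignore:.*use of forkpty.*:DeprecationWarning:pty
```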
**E2E `requirements.txt`** — `@@ -0,0 +1,6 @@`

```text
pexpect
pyhocon
pytest
pytest-xdist
pytest-timeout
pytest-timer
```
**Test data (HOCON)** — `@@ -0,0 +1,29 @@`

```hocon
# test_data.hocon
# Input/output test pairs

test = [
    {
        input_1: {
            user_text: "Who did yellow submarine?"
            answer: {
                type_match: "keyword"
                word: "Beatles"
                cost: "3.0"
            }
        }
    },
    {
        input_2: {
            user_text: "Where were they from?"
            answer: {
                type_match: "keyword"
                word: "Liverpool"
                cost: "6.0"
            }
        }
    },
    {
        input_done: "quit"
    }
]
```

> **Collaborator:** Don't abbreviate, leaving people guessing as to what this file is for.
>
> **Collaborator:** That you have cost built in as a key likely means that this format is very tightly coupled to a particular test.
>
> **Collaborator:** You should be able to make this test using the existing infrastructure.
>
> **Author:** Yes, I would like to use the sly_data feature, but I haven't gotten to it yet. That's an excellent suggestion; I'll look into it.
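A runner might walk the pair list above as follows, sketched with an equivalent Python structure (pyhocon's `ConfigFactory` yields dict-like objects of the same shape; the handling of `input_done` as a session-ending sentinel is an assumption based on the data):

```python
# Walks the ordered test-pair list: each input_N entry carries a prompt and
# its expected answer; input_done holds the command that ends the CLI session.
test_data = [
    {"input_1": {"user_text": "Who did yellow submarine?",
                 "answer": {"type_match": "keyword", "word": "Beatles", "cost": "3.0"}}},
    {"input_2": {"user_text": "Where were they from?",
                 "answer": {"type_match": "keyword", "word": "Liverpool", "cost": "6.0"}}},
    {"input_done": "quit"},
]

exchanges = []
quit_command = None
for entry in test_data:
    key, value = next(iter(entry.items()))
    if key == "input_done":
        quit_command = value
    else:
        exchanges.append((value["user_text"], value["answer"]))
```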
**`tests/test_music_nerd_pro.py`** — `@@ -0,0 +1,33 @@`

```python
# test_music_nerd_pro.py
# ---------------------------------------------------------
# Parametrized E2E test case that drives CLI interaction tests
# ---------------------------------------------------------

import pytest

from utils.mnpt_hocon_loader import extract_test_values
from utils.mnpt_test_runner import run_test


@pytest.mark.e2e
@pytest.mark.timeout(120)
def test_run_connection(connection_name, repeat_index, request):
    """
    End-to-end test for the music_nerd_pro agent across different connections.

    This test:
    - Dynamically parametrizes across multiple connections (e.g., direct, grpc, http).
    - Supports repeated test runs via `repeat_index`.
    - Optionally uses a 'thinking file' if the --thinking-file pytest option is passed.
    """
    # Retrieve the custom CLI option for thinking-file usage
    use_thinking_file = request.config.getoption("--thinking-file")

    # Extract the required test values for the given connection
    result = extract_test_values(connection_name)

    # Defensive check (optional but good practice)
    assert result is not None, f"Failed to extract test values for connection: {connection_name}"

    # Execute the CLI-based test
    run_test(*result, repeat_index, use_thinking_file)
```
> **Reviewer:** Do we want/need to run the e2e tests every time? I'm not arguing that we should or should not; I'm just wondering. How solid are these tests against false failures?
>
> **Author:** The test is solid after I updated Music Nerd Pro. It has triggered over 100 runs without any false failures. This single test case takes about 18 seconds. It is an e2e smoke test; I should label it as a smoke test instead. This quick test should help determine what went wrong on the server side. Since it uses the agent_cli, it also helps determine whether the client is working or not. If it is too much, we could trigger it once a day.