Skip to content

Commit 7a52648

Browse files
author
Mateusz
committed
Perf: optimize regex matching and streaming response handling.
- EditPrecisionResponseMiddleware: Use combined regex for a single-pass fast-fail check. - BackendStreamingResponseHandler: Avoid json.dumps/dict copies in hot loops. - Fix test_backend_streaming_response_handler.py mock compatibility.
1 parent a40e3b5 commit 7a52648

File tree

3 files changed

+66
-7
lines changed

3 files changed

+66
-7
lines changed

src/core/services/backend_request_manager/streaming_response_handler.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,19 @@ def _extract_text_from_chunk(self, chunk: ProcessedResponse) -> str:
115115
except UnicodeDecodeError:
116116
return content.decode("utf-8", errors="ignore")
117117
if isinstance(content, dict):
118-
# Use dict() to safely handle StopChunkWithUsage which is a dict subclass
118+
# OPTIMIZATION: Extract from standard OpenAI format directly to avoid expensive json.dumps
119+
# This is on the hot path for every token (loop detection + meaning check)
120+
if "choices" in content and isinstance(content["choices"], list):
121+
choices = content["choices"]
122+
if choices and isinstance(choices[0], dict):
123+
# Try delta (stream) or message (non-stream)
124+
delta = choices[0].get("delta") or choices[0].get("message")
125+
if isinstance(delta, dict) and "content" in delta:
126+
val = delta["content"]
127+
if val is not None:
128+
return str(val)
129+
130+
# Fallback: Use dict() to safely handle StopChunkWithUsage which is a dict subclass
119131
return json.dumps(dict(content))
120132
return str(content) if content is not None else ""
121133

@@ -695,7 +707,7 @@ async def monitored_stream() -> AsyncIterator[ProcessedResponse]:
695707
],
696708
}
697709
yield ProcessedResponse(
698-
content=cancellation_payload,
710+
content=cast(Any, cancellation_payload),
699711
metadata={
700712
"is_cancellation": True,
701713
"is_done": True,
@@ -730,9 +742,13 @@ async def attach_metadata_stream() -> AsyncIterator[ProcessedResponse]:
730742

731743
async for chunk in monitored_stream():
732744
if isinstance(chunk, ProcessedResponse):
733-
processed_metadata: dict[str, JsonValue] = dict(
734-
chunk.metadata or {}
735-
)
745+
# OPTIMIZATION: Modify metadata in-place to avoid copying dicts per-token
746+
if chunk.metadata is None:
747+
chunk.metadata = {}
748+
749+
# We own this chunk (transient), so in-place modification is safe and faster
750+
processed_metadata = chunk.metadata # type: ignore
751+
736752
if original_request_payload is not None:
737753
processed_metadata.setdefault(
738754
"original_request", original_request_payload
@@ -744,7 +760,7 @@ async def attach_metadata_stream() -> AsyncIterator[ProcessedResponse]:
744760
processed_metadata.setdefault(
745761
"client_os", cast(JsonValue, processing_context.client_os)
746762
)
747-
chunk.metadata = processed_metadata
763+
# No need to re-assign chunk.metadata as we modified it in place
748764
yield chunk
749765
else:
750766
metadata: dict[str, JsonValue] = {}

src/core/services/edit_precision_response_middleware.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ def __init__(self, app_state: IApplicationState, priority: int = 10) -> None:
5757
self._app_state = app_state
5858
self._compiled = list(self._DEFAULT_PATTERNS)
5959
self._last_stream_ids: dict[str, str] = {}
60+
self._combined_pattern: re.Pattern[str] | None = None
6061

6162
try:
6263
from src.core.services.edit_precision_patterns import get_response_patterns
@@ -88,6 +89,29 @@ def __init__(self, app_state: IApplicationState, priority: int = 10) -> None:
8889
exc_info=True,
8990
)
9091

92+
# Pre-compile a combined regex for fast-fail checks
93+
# This collapses N separate regex scans into a single pass for the common case (no match)
94+
try:
95+
pattern_strings = []
96+
for p in self._compiled:
97+
if hasattr(p, "pattern"):
98+
pattern_strings.append(p.pattern)
99+
else:
100+
pattern_strings.append(str(p))
101+
102+
if pattern_strings:
103+
# Use non-capturing groups for safety
104+
combined = "|".join(f"(?:{p})" for p in pattern_strings)
105+
self._combined_pattern = re.compile(combined, re.IGNORECASE | re.DOTALL)
106+
else:
107+
self._combined_pattern = None
108+
except Exception as err:
109+
if self._logger.isEnabledFor(logging.WARNING):
110+
self._logger.warning(
111+
"Failed to compile combined edit precision pattern: %s", err
112+
)
113+
self._combined_pattern = None
114+
91115
@staticmethod
92116
def _extract_text_from_chunk(chunk: dict) -> str:
93117
"""Extract text content from an OpenAI-format streaming chunk."""
@@ -141,7 +165,15 @@ def _process_response(
141165

142166
matched_pattern: str | None = None
143167
if combined_text:
144-
for p in self._compiled:
168+
# OPTIMIZATION: Use combined pattern for a single-pass fast-fail check
169+
# If combined pattern exists and doesn't match, we can skip individual checks
170+
should_scan = True
171+
if self._combined_pattern and not self._combined_pattern.search(
172+
combined_text
173+
):
174+
should_scan = False
175+
176+
for p in self._compiled if should_scan else []:
145177
try:
146178
if p.search(combined_text):
147179
matched_pattern = getattr(p, "pattern", None) or str(p)

tests/unit/core/services/test_backend_streaming_response_handler.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -952,6 +952,17 @@ async def test_attaches_client_os_to_chunks(
952952
processed_stream
953953
)
954954

955+
# Mock loop detector and Angel verifier
956+
mock_loop_detector = MagicMock(spec=ILoopDetector)
957+
mock_loop_detector.process_chunk.return_value = None
958+
mock_loop_detector_factory.create.return_value = mock_loop_detector
959+
960+
async def passthrough_stream(request, stream, context, request_context=None):
961+
async for chunk in stream:
962+
yield chunk
963+
964+
mock_angel_stream_verifier.verify_or_passthrough = passthrough_stream
965+
955966
# Act
956967
result = await handler.handle(
957968
stream=stream_envelope,

0 commit comments

Comments
 (0)