From 7b2a224c97573ea83377ec24f382440e7af7f652 Mon Sep 17 00:00:00 2001 From: Noravee Kanchanavatee Date: Wed, 30 Jul 2025 11:32:33 -0700 Subject: [PATCH 1/3] Modify the logic to extract the delimited block. --- .../structure/json_structure_parser.py | 91 ++++++++++--------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/neuro_san/internals/parsers/structure/json_structure_parser.py b/neuro_san/internals/parsers/structure/json_structure_parser.py index 341d6e22a..164f7e95b 100644 --- a/neuro_san/internals/parsers/structure/json_structure_parser.py +++ b/neuro_san/internals/parsers/structure/json_structure_parser.py @@ -10,9 +10,10 @@ # # END COPYRIGHT +import re from typing import Any from typing import Dict -from typing import List +from typing import Optional from json.decoder import JSONDecodeError from json_repair import loads @@ -45,56 +46,56 @@ def parse_structure(self, content: str) -> Dict[str, Any]: "{": "}", } - for start_delim, end_delim in delimiters.items(): - if start_delim in content: + meat, self.remainder = self._extract_delimited_block(content, delimiters) - # Note: This code assumes we only have one delimited JSON structure to parse - # within the content. + # Attempt parsing the structure from the meat + structure: Dict[str, Any] = None + if meat: + try: + structure = loads(meat) + if not isinstance(structure, Dict): + # json_repair seems to sometimes return an empty string if there is nothing + # for it to grab onto. + structure = None + except JSONDecodeError: + # Couldn't parse + self.remainder = None + + return structure + + def _extract_delimited_block(self, text: str, delimiters: Dict[str, str]) -> tuple[Optional[str], str]: + """ + Extracts a block of text from the input string "text" that is enclosed between any + of the provided delimiter pairs. 
Returns a tuple of: + - The extracted main block with delimiters, or None if no match + - The remaining string with the block removed and extra whitespace collapsed - # Well-formed per delimiter - split_header: List[str] = content.split(start_delim) + :param text: The input string potentially containing a delimited block + :param delimiters: A dictionary mapping starting delimiters to ending delimiters - # Start the remainder off with everything before the json backtick business - self.remainder = split_header[0] + :return: A tuple of (main block content, remainder string) + """ + # Try each delimiter pair in order + for start, end in delimiters.items(): + # Build a regex pattern to find content between start and end delimiters + # - re.escape ensures special characters like "{" are treated literally + # - (.*) is a greedy match for any characters between the delimiters + pattern = re.escape(start) + r"(.*)" + re.escape(end) - # Find the end of the backticks if any - if end_delim != start_delim: - split_footer: List[str] = split_header[-1].split(end_delim) - meat = split_footer[0] - if len(split_footer) > 1: - # Add to the remainder anything outside the delimiting backticks - self.remainder += split_footer[-1] - else: - meat = split_header[1] - if len(split_header) > 2: - # Add the remaining with the end delimiter. - # We are only parsing the first we find. - self.remainder += end_delim.join(split_header[2:]) + # Perform regex search across multiple lines if needed (DOTALL allows "." to match newlines) + match = re.search(pattern, text, re.DOTALL) - # Meat is everything in between, maybe with start and end delims on either end. - meat = meat.strip() + if match: + # Extract the matched content (including the delimiters), removing leading/trailing whitespace + main = match.group(0).strip() - # Maybe add the delimiters back to help parsing the meat. 
- use_delims: bool = start_delim != end_delim - if use_delims: - meat = f"{start_delim}{meat}{end_delim}" + # Remove the matched block (including delimiters) from the input string + remainder = text[:match.start()] + text[match.end():] - break + # Clean up extra whitespace in the remainder (collapse multiple spaces) + remainder = re.sub(r"\s+", " ", remainder).strip() - # Attempt parsing the structure from the meat - structure: Dict[str, Any] = None - try: - structure = loads(meat) - if not isinstance(structure, Dict): - # json_repair seems to sometimes return an empty string if there is nothing - # for it to grab onto. - structure = None - except JSONDecodeError: - # Couldn't parse - self.remainder = None - - # Strip any whitespace of the ends of any remainder. - if self.remainder is not None: - self.remainder = self.remainder.strip() + return main, remainder - return structure + # If no matching delimiters were found, return None and the full cleaned-up input + return None, text.strip() From 21f659c06107c71c12f1f545e18ac395156a6386 Mon Sep 17 00:00:00 2001 From: Noravee Kanchanavatee Date: Wed, 30 Jul 2025 12:10:50 -0700 Subject: [PATCH 2/3] - modify the json parser to return None for remainder if structure is None and do not clean up remainder - add test cases for nested dict in the unit test --- .../structure/json_structure_parser.py | 37 ++++---- .../structure/test_json_structure_parser.py | 85 +++++++++++++++++++ 2 files changed, 104 insertions(+), 18 deletions(-) diff --git a/neuro_san/internals/parsers/structure/json_structure_parser.py b/neuro_san/internals/parsers/structure/json_structure_parser.py index 164f7e95b..9b7887bff 100644 --- a/neuro_san/internals/parsers/structure/json_structure_parser.py +++ b/neuro_san/internals/parsers/structure/json_structure_parser.py @@ -11,6 +11,7 @@ # END COPYRIGHT import re +from re import Match from typing import Any from typing import Dict from typing import Optional @@ -50,16 +51,19 @@ def parse_structure(self, 
content: str) -> Dict[str, Any]: # Attempt parsing the structure from the meat structure: Dict[str, Any] = None - if meat: - try: - structure = loads(meat) - if not isinstance(structure, Dict): - # json_repair seems to sometimes return an empty string if there is nothing - # for it to grab onto. - structure = None - except JSONDecodeError: - # Couldn't parse - self.remainder = None + + try: + structure = loads(meat) + if not isinstance(structure, Dict): + # json_repair seems to sometimes return an empty string if there is nothing + # for it to grab onto. + structure = None + except JSONDecodeError: + # Couldn't parse + self.remainder = None + except TypeError: + # meat is None + self.remainder = None return structure @@ -80,22 +84,19 @@ def _extract_delimited_block(self, text: str, delimiters: Dict[str, str]) -> tup # Build a regex pattern to find content between start and end delimiters # - re.escape ensures special characters like "{" are treated literally # - (.*) is a greedy match for any characters between the delimiters - pattern = re.escape(start) + r"(.*)" + re.escape(end) + pattern: str = re.escape(start) + r"(.*)" + re.escape(end) # Perform regex search across multiple lines if needed (DOTALL allows "." 
to match newlines) - match = re.search(pattern, text, re.DOTALL) + match: Match[str] = re.search(pattern, text, re.DOTALL) if match: # Extract the matched content (including the delimiters), removing leading/trailing whitespace - main = match.group(0).strip() + main: str = match.group(0).strip() # Remove the matched block (including delimiters) from the input string - remainder = text[:match.start()] + text[match.end():] - - # Clean up extra whitespace in the remainder (collapse multiple spaces) - remainder = re.sub(r"\s+", " ", remainder).strip() + remainder: str = text[:match.start()] + text[match.end():] - return main, remainder + return main, remainder.strip() # If no matching delimiters were found, return None and the full cleaned-up input return None, text.strip() diff --git a/tests/neuro_san/internals/parsers/structure/test_json_structure_parser.py b/tests/neuro_san/internals/parsers/structure/test_json_structure_parser.py index cbde77792..327cc75d4 100644 --- a/tests/neuro_san/internals/parsers/structure/test_json_structure_parser.py +++ b/tests/neuro_san/internals/parsers/structure/test_json_structure_parser.py @@ -308,3 +308,88 @@ def test_just_backtick_no_remainder(self): remainder: str = parser.get_remainder() self.assertIsNotNone(remainder) self.assertEqual(remainder, "") + + def test_json_backtick_nested_no_remainder(self): + """ + Tests standard json backtick/markdown in response. 
+ """ + test: str = """ +```json +{ + "key_1": "value_1", + "key_2": { + "key_3": "value_3" + } +} +``` +""" + parser = JsonStructureParser() + + structure: Dict[str, Any] = parser.parse_structure(test) + self.assertIsNotNone(structure) + value_1: str = structure.get("key_1") + self.assertEqual(value_1, "value_1") + value_2: Dict[str, str] = structure.get("key_2") + self.assertEqual(value_2, {"key_3": "value_3"}) + value_3: str = structure.get("key_2").get("key_3") + self.assertEqual(value_3, "value_3") + + remainder: str = parser.get_remainder() + self.assertIsNotNone(remainder) + self.assertEqual(remainder, "") + + def test_no_backtick_nested_no_remainder(self): + """ + Tests no backtick/markdown in response. + """ + test: str = """ +{ + "key_1": "value_1", + "key_2": { + "key_3": "value_3" + } +} +""" + parser = JsonStructureParser() + + structure: Dict[str, Any] = parser.parse_structure(test) + self.assertIsNotNone(structure) + value_1: str = structure.get("key_1") + self.assertEqual(value_1, "value_1") + value_2: Dict[str, str] = structure.get("key_2") + self.assertEqual(value_2, {"key_3": "value_3"}) + value_3: str = structure.get("key_2").get("key_3") + self.assertEqual(value_3, "value_3") + + remainder: str = parser.get_remainder() + self.assertIsNotNone(remainder) + self.assertEqual(remainder, "") + + def test_just_backtick_nested_no_remainder(self): + """ + Tests bare backticks (no json language tag) around nested JSON in response. 
+ """ + test: str = """ +``` +{ + "key_1": "value_1", + "key_2": { + "key_3": "value_3" + } +} +``` +""" + parser = JsonStructureParser() + + structure: Dict[str, Any] = parser.parse_structure(test) + self.assertIsNotNone(structure) + value_1: str = structure.get("key_1") + self.assertEqual(value_1, "value_1") + value_2: Dict[str, str] = structure.get("key_2") + self.assertEqual(value_2, {"key_3": "value_3"}) + value_3: str = structure.get("key_2").get("key_3") + self.assertEqual(value_3, "value_3") + + remainder: str = parser.get_remainder() + self.assertIsNotNone(remainder) + self.assertEqual(remainder, "") From 58772837aced19fd3077e770a32d5ac4ca81ce0d Mon Sep 17 00:00:00 2001 From: Noravee Kanchanavatee Date: Wed, 30 Jul 2025 14:05:49 -0700 Subject: [PATCH 3/3] minor --- neuro_san/internals/parsers/structure/json_structure_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neuro_san/internals/parsers/structure/json_structure_parser.py b/neuro_san/internals/parsers/structure/json_structure_parser.py index 9b7887bff..6cfda5319 100644 --- a/neuro_san/internals/parsers/structure/json_structure_parser.py +++ b/neuro_san/internals/parsers/structure/json_structure_parser.py @@ -15,6 +15,7 @@ from typing import Any from typing import Dict from typing import Optional +from typing import Tuple from json.decoder import JSONDecodeError from json_repair import loads @@ -67,7 +68,7 @@ def parse_structure(self, content: str) -> Dict[str, Any]: return structure - def _extract_delimited_block(self, text: str, delimiters: Dict[str, str]) -> tuple[Optional[str], str]: + def _extract_delimited_block(self, text: str, delimiters: Dict[str, str]) -> Tuple[Optional[str], str]: """ Extracts a block of text from the input string "text" that is enclosed between any of the provided delimiter pairs. Returns a tuple of: