From f7bc20290c07c96ff5c6ccb257a1ea71198ac658 Mon Sep 17 00:00:00 2001 From: Jason Weddington Date: Fri, 27 Feb 2026 05:50:27 -0800 Subject: [PATCH 1/2] =?UTF-8?q?fix(smithy-json):=20escape=20control=20char?= =?UTF-8?q?acters=20in=20write=5Fstring=20per=20RFC=208259=20=C2=A77?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit StreamingJSONEncoder.write_string() only escaped backslash and double quote. Control characters U+0000–U+001F (newline, tab, CR, etc.) were written as raw bytes, producing invalid JSON that causes SerializationException on API calls with multi-line string fields. Use a regex to escape all control characters: named escapes for common ones (\n, \r, \t, \b, \f) and \uXXXX for the rest. --- .../src/smithy_json/_private/serializers.py | 29 +++++++++- packages/smithy-json/tests/unit/__init__.py | 6 ++ .../tests/unit/test_serializers.py | 57 +++++++++++++++++++ 3 files changed, 89 insertions(+), 3 deletions(-) diff --git a/packages/smithy-json/src/smithy_json/_private/serializers.py b/packages/smithy-json/src/smithy_json/_private/serializers.py index c1cd3df70..7146923e4 100644 --- a/packages/smithy-json/src/smithy_json/_private/serializers.py +++ b/packages/smithy-json/src/smithy_json/_private/serializers.py @@ -1,6 +1,7 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 +import re from base64 import b64encode from collections.abc import Callable, Mapping, Sequence from contextlib import AbstractContextManager @@ -27,6 +28,30 @@ _INF: float = float("inf") _NEG_INF: float = float("-inf") +# RFC 8259 §7: All control characters U+0000 through U+001F MUST be escaped. +_ESCAPE_MAP: dict[str, str] = { + "\\": "\\\\", + '"': '\\"', + "\n": "\\n", + "\r": "\\r", + "\t": "\\t", + "\b": "\\b", + "\f": "\\f", +} +_CHARS_TO_ESCAPE = re.compile(r'[\\"\x00-\x1f]') + + +def _escape_char(match: re.Match[str]) -> str: + c = match.group() + if c in _ESCAPE_MAP: + return _ESCAPE_MAP[c] + return f"\\u{ord(c):04x}" + + +def _escape_string(value: str) -> str: + """Escape a string value per RFC 8259 §7.""" + return _CHARS_TO_ESCAPE.sub(_escape_char, value) + class JSONShapeSerializer(ShapeSerializer): def __init__(self, sink: BytesWriter, settings: JSONSettings) -> None: @@ -271,9 +296,7 @@ def write_document_value( def write_string(self, value: str) -> None: self._sink.write(b'"') - self._sink.write( - value.replace("\\", "\\\\").replace('"', '\\"').encode("utf-8") - ) + self._sink.write(_escape_string(value).encode("utf-8")) self._sink.write(b'"') def write_int(self, value: int) -> None: diff --git a/packages/smithy-json/tests/unit/__init__.py b/packages/smithy-json/tests/unit/__init__.py index 294dc5457..8e666cdb5 100644 --- a/packages/smithy-json/tests/unit/__init__.py +++ b/packages/smithy-json/tests/unit/__init__.py @@ -349,6 +349,12 @@ def _read_optional_map(k: str, d: ShapeDeserializer): (Decimal("1.1"), b"1.1"), (b"foo", b'"Zm9v"'), ("foo", b'"foo"'), + # RFC 8259 §7: control characters must be escaped + ("line 1\nline 2", b'"line 1\\nline 2"'), + ("col 1\tcol 2", b'"col 1\\tcol 2"'), + ("a\rb", b'"a\\rb"'), + ("a\\b", b'"a\\\\b"'), + ('a"b', b'"a\\"b"'), (datetime(2024, 5, 15, tzinfo=UTC), b'"2024-05-15T00:00:00Z"'), (None, b"null"), (["foo"], b'["foo"]'), diff --git a/packages/smithy-json/tests/unit/test_serializers.py b/packages/smithy-json/tests/unit/test_serializers.py index 552c23a0b..fd04bb7a8 100644 --- a/packages/smithy-json/tests/unit/test_serializers.py +++ b/packages/smithy-json/tests/unit/test_serializers.py @@ -1,5 +1,6 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 +import json from datetime import datetime from decimal import Decimal from io import BytesIO @@ -82,3 +83,59 @@ def test_json_serializer(given: Any, expected: bytes) -> None: sink.seek(0) actual = sink.read() assert actual == expected + + +def _serialize_string(value: str) -> bytes: + """Serialize a string value through the JSON codec and return raw bytes.""" + sink = BytesIO() + serializer = JSONCodec().create_serializer(sink) + serializer.write_string(STRING, value) + serializer.flush() + sink.seek(0) + return sink.read() + + +class TestStringControlCharEscaping: + """RFC 8259 §7: All control characters U+0000-U+001F must be escaped.""" + + @pytest.mark.parametrize( + "char, escaped", + [ + ("\n", "\\n"), + ("\r", "\\r"), + ("\t", "\\t"), + ("\b", "\\b"), + ("\f", "\\f"), + ], + ) + def test_named_control_chars(self, char: str, escaped: str) -> None: + result = _serialize_string(f"a{char}b") + assert result == f'"a{escaped}b"'.encode() + + def test_all_control_chars_produce_valid_json(self) -> None: + """Every U+0000-U+001F character must be escaped so output is valid JSON.""" + for cp in range(0x20): + value = f"before{chr(cp)}after" + raw = _serialize_string(value) + # Must parse as valid JSON + parsed = json.loads(raw) + assert parsed == value, f"Round-trip failed for U+{cp:04X}" + + def test_null_byte(self) -> None: + result = _serialize_string("a\x00b") + assert result == b'"a\\u0000b"' + + def test_mixed_escapes(self) -> None: + result = _serialize_string('line 1\nline 2\t"quoted"\r\n') + assert result == b'"line 1\\nline 2\\t\\"quoted\\"\\r\\n"' + + def test_existing_backslash_and_quote_still_escaped(self) -> None: + result = _serialize_string('a\\b"c') + assert result == b'"a\\\\b\\"c"' + + def test_serialized_output_is_valid_json(self) -> None: + """Realistic multi-line prompt string produces valid JSON.""" + value = "System: You are helpful.\nUser: Hello\nAssistant:" + raw = _serialize_string(value) + parsed = json.loads(raw) + assert parsed == value From 4b70d2ef2ec3e0db4954d84538caed13f6a2835c Mon Sep 17 00:00:00 2001 From: Jason Weddington Date: Fri, 27 Feb 2026 10:42:03 -0800 Subject: [PATCH 2/2] adding changelog entry --- ...thy-json-enhancement-37b9c0663ce84096905fd484478ce94d.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 packages/smithy-json/.changes/next-release/smithy-json-enhancement-37b9c0663ce84096905fd484478ce94d.json diff --git a/packages/smithy-json/.changes/next-release/smithy-json-enhancement-37b9c0663ce84096905fd484478ce94d.json b/packages/smithy-json/.changes/next-release/smithy-json-enhancement-37b9c0663ce84096905fd484478ce94d.json new file mode 100644 index 000000000..83454f1b1 --- /dev/null +++ b/packages/smithy-json/.changes/next-release/smithy-json-enhancement-37b9c0663ce84096905fd484478ce94d.json @@ -0,0 +1,4 @@ +{ + "type": "enhancement", + "description": "Fixed string serialization to escape all control characters (U+0000-U+001F) per [RFC 8259](https://www.rfc-editor.org/rfc/rfc8259#section-7), preventing invalid JSON output for multiline and other control-character-containing strings. ([#647](https://github.com/smithy-lang/smithy-python/pull/647))" +} \ No newline at end of file