diff --git a/packages/smithy-json/.changes/next-release/smithy-json-enhancement-37b9c0663ce84096905fd484478ce94d.json b/packages/smithy-json/.changes/next-release/smithy-json-enhancement-37b9c0663ce84096905fd484478ce94d.json new file mode 100644 index 000000000..83454f1b1 --- /dev/null +++ b/packages/smithy-json/.changes/next-release/smithy-json-enhancement-37b9c0663ce84096905fd484478ce94d.json @@ -0,0 +1,4 @@ +{ + "type": "enhancement", + "description": "Fixed string serialization to escape all control characters (U+0000-U+001F) per [RFC 8259](https://www.rfc-editor.org/rfc/rfc8259#section-7), preventing invalid JSON output for multiline and other control-character-containing strings. ([#647](https://github.com/smithy-lang/smithy-python/pull/647))" +} \ No newline at end of file diff --git a/packages/smithy-json/src/smithy_json/_private/serializers.py b/packages/smithy-json/src/smithy_json/_private/serializers.py index c1cd3df70..7146923e4 100644 --- a/packages/smithy-json/src/smithy_json/_private/serializers.py +++ b/packages/smithy-json/src/smithy_json/_private/serializers.py @@ -1,6 +1,7 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 +import re from base64 import b64encode from collections.abc import Callable, Mapping, Sequence from contextlib import AbstractContextManager @@ -27,6 +28,30 @@ _INF: float = float("inf") _NEG_INF: float = float("-inf") +# RFC 8259 §7: All control characters U+0000 through U+001F MUST be escaped. 
+_ESCAPE_MAP: dict[str, str] = { + "\\": "\\\\", + '"': '\\"', + "\n": "\\n", + "\r": "\\r", + "\t": "\\t", + "\b": "\\b", + "\f": "\\f", +} +_CHARS_TO_ESCAPE = re.compile(r'[\\"\x00-\x1f]') + + +def _escape_char(match: re.Match[str]) -> str: + c = match.group() + if c in _ESCAPE_MAP: + return _ESCAPE_MAP[c] + return f"\\u{ord(c):04x}" + + +def _escape_string(value: str) -> str: + """Escape a string value per RFC 8259 §7.""" + return _CHARS_TO_ESCAPE.sub(_escape_char, value) + class JSONShapeSerializer(ShapeSerializer): def __init__(self, sink: BytesWriter, settings: JSONSettings) -> None: @@ -271,9 +296,7 @@ def write_document_value( def write_string(self, value: str) -> None: self._sink.write(b'"') - self._sink.write( - value.replace("\\", "\\\\").replace('"', '\\"').encode("utf-8") - ) + self._sink.write(_escape_string(value).encode("utf-8")) self._sink.write(b'"') def write_int(self, value: int) -> None: diff --git a/packages/smithy-json/tests/unit/__init__.py b/packages/smithy-json/tests/unit/__init__.py index 294dc5457..8e666cdb5 100644 --- a/packages/smithy-json/tests/unit/__init__.py +++ b/packages/smithy-json/tests/unit/__init__.py @@ -349,6 +349,12 @@ def _read_optional_map(k: str, d: ShapeDeserializer): (Decimal("1.1"), b"1.1"), (b"foo", b'"Zm9v"'), ("foo", b'"foo"'), + # RFC 8259 §7: control characters must be escaped + ("line 1\nline 2", b'"line 1\\nline 2"'), + ("col 1\tcol 2", b'"col 1\\tcol 2"'), + ("a\rb", b'"a\\rb"'), + ("a\\b", b'"a\\\\b"'), + ('a"b', b'"a\\"b"'), (datetime(2024, 5, 15, tzinfo=UTC), b'"2024-05-15T00:00:00Z"'), (None, b"null"), (["foo"], b'["foo"]'), diff --git a/packages/smithy-json/tests/unit/test_serializers.py b/packages/smithy-json/tests/unit/test_serializers.py index 552c23a0b..fd04bb7a8 100644 --- a/packages/smithy-json/tests/unit/test_serializers.py +++ b/packages/smithy-json/tests/unit/test_serializers.py @@ -1,5 +1,6 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: Apache-2.0
import json
from datetime import datetime
from decimal import Decimal
from io import BytesIO


def _serialize_string(value: str) -> bytes:
    """Serialize a string value through the JSON codec and return raw bytes."""
    sink = BytesIO()
    serializer = JSONCodec().create_serializer(sink)
    serializer.write_string(STRING, value)
    serializer.flush()
    # getvalue() returns the whole buffer regardless of the stream position.
    return sink.getvalue()


class TestStringControlCharEscaping:
    """RFC 8259 §7: All control characters U+0000-U+001F must be escaped."""

    @pytest.mark.parametrize(
        "char, escaped",
        [
            ("\n", "\\n"),
            ("\r", "\\r"),
            ("\t", "\\t"),
            ("\b", "\\b"),
            ("\f", "\\f"),
        ],
    )
    def test_named_control_chars(self, char: str, escaped: str) -> None:
        """Characters with a short escape use it instead of ``\\uXXXX``."""
        result = _serialize_string(f"a{char}b")
        assert result == f'"a{escaped}b"'.encode()

    # One test case per code point, so a failure for one character is
    # reported on its own and does not hide the remaining characters.
    @pytest.mark.parametrize(
        "code_point",
        range(0x20),
        ids=lambda code_point: f"U+{code_point:04X}",
    )
    def test_all_control_chars_produce_valid_json(self, code_point: int) -> None:
        """Every U+0000-U+001F character must be escaped so output is valid JSON."""
        value = f"before{chr(code_point)}after"
        raw = _serialize_string(value)
        # Must parse as valid JSON and round-trip to the original string.
        parsed = json.loads(raw)
        assert parsed == value, f"Round-trip failed for U+{code_point:04X}"

    def test_null_byte(self) -> None:
        """A character without a short escape is written as ``\\uXXXX``."""
        result = _serialize_string("a\x00b")
        assert result == b'"a\\u0000b"'

    def test_mixed_escapes(self) -> None:
        """Several different escapes in one string are all applied."""
        result = _serialize_string('line 1\nline 2\t"quoted"\r\n')
        assert result == b'"line 1\\nline 2\\t\\"quoted\\"\\r\\n"'

    def test_existing_backslash_and_quote_still_escaped(self) -> None:
        """Backslash and quotation mark keep their escapes."""
        result = _serialize_string('a\\b"c')
        assert result == b'"a\\\\b\\"c"'

    def test_serialized_output_is_valid_json(self) -> None:
        """Realistic multi-line prompt string produces valid JSON."""
        value = "System: You are helpful.\nUser: Hello\nAssistant:"
        raw = _serialize_string(value)
        parsed = json.loads(raw)
        assert parsed == value