Fix decode_array double-reading length bytes for arrays with 24+ items (#479)

cffls · web-flow · commit 1d2e82510058 · 2026-02-27T19:30:33.000-08:00
The custom decode_array override in serialization.py called _decode_length
to check for indefinite-length arrays, then delegated to the original
decode_array, which called _decode_length again. For arrays with fewer than
24 items, the length is encoded directly in the subtype (no stream bytes
consumed), so the double call was harmless. For 24+ items, CBOR stores the
length in extra bytes after the initial byte (e.g. 0x98 0x18 for 24 items),
so _decode_length reads from the stream — the second call consumed actual
array content as a length byte, corrupting the decode.

Replace the _decode_length call with a simple subtype == 31 check, which
is sufficient to detect indefinite-length arrays without consuming any
bytes from the stream.

This bug only affected cbor2pure, not the cbor2 C extension.
diff --git a/pycardano/serialization.py b/pycardano/serialization.py
@@ -196,9 +196,9 @@ def wrapper(cls, value: Primitive):
 
 def decode_array(self, subtype: int) -> Sequence[Any]:
     # Major tag 4
-    length = self._decode_length(subtype, allow_indefinite=True)
-
-    if length is None:
+    if subtype == 31:
+        # Indefinite length array — delegate to the original decoder, then wrap
+        # the result in IndefiniteFrozenList to preserve indefinite encoding.
         ret = IndefiniteFrozenList(list(self.decode_array(subtype=subtype)))
         ret.freeze()
         return ret
diff --git a/test/pycardano/test_serialization.py b/test/pycardano/test_serialization.py
@@ -1134,6 +1134,38 @@ class MyTest(ArrayCBORSerializable):
     assert isinstance(MyTest.from_cbor(a.to_cbor()).a, IndefiniteList)
 
 
+def test_decode_array_with_24_or_more_items():
+    """Test that definite-length arrays with 24+ items decode correctly.
+
+    Regression test for a bug where the custom decode_array override called
+    _decode_length (consuming stream bytes), then delegated to the original
+    decode_array which called _decode_length again. For arrays with < 24 items
+    the length is encoded in the subtype itself (no extra bytes), so the double
+    call was harmless. For 24+ items, CBOR uses multi-byte length encoding
+    (e.g. 98 18 for 24 items) and the second _decode_length call consumed
+    actual array content, corrupting the stream.
+    """
+
+    @dataclass
+    class LargeDatum(PlutusData):
+        CONSTR_ID = 1
+        data: List[bytes]
+
+    hello = b"Hello world!"
+
+    # Exactly 24 items — the threshold where CBOR switches to 2-byte length
+    datum24 = LargeDatum(data=[hello] * 24)
+    restored24 = LargeDatum.from_cbor(datum24.to_cbor())
+    assert len(restored24.data) == 24
+    assert all(x == hello for x in restored24.data)
+
+    # 25 items — above the threshold
+    datum25 = LargeDatum(data=[hello] * 25)
+    restored25 = LargeDatum.from_cbor(datum25.to_cbor())
+    assert len(restored25.data) == 25
+    assert all(x == hello for x in restored25.data)
+
+
 def test_liqwid_tx():
     with open("test/resources/cbors/liqwid.json") as f:
         cbor_hex = json.load(f).get("cborHex")