feat: add streaming support

VGSML · VGSML · commit 22b581028f52 · 2025-07-25T21:03:27.000+02:00
diff --git a/Makefile b/Makefile
@@ -1,6 +1,22 @@
 venv:
 	uv venv
 	source .venv/bin/activate && uv pip install -e .
-
 run:
-	source .venv/bin/activate && python example.py
+	source .venv/bin/activate && python example.py
+
+clean:
+	rm -rf .venv
+	rm -rf __pycache__
+	rm -rf *.pyc
+	rm -rf *.pyo
+	rm -rf *.pyd
+	rm -rf *.egg-info
+
+lab:
+	source .venv/bin/activate && \
+		uv pip install jupyterlab ipykernel && \
+		uv pip install jupyterlab-lsp python-lsp-server[all] && \
+		uv pip install python-lsp-server[rope] pylsp-mypy pylsp-rope && \
+		uv pip install -e . && \
+		python -m ipykernel install --user --name=venv --display-name="Python (hugr)" && \
+		jupyter lab --no-browser --port=8888
diff --git a/README.md b/README.md
@@ -131,6 +131,123 @@ hugr.query(
 )
 ```
 
+## Streaming API
+
+In addition to standard HTTP queries, `hugr-client` supports asynchronous streaming of data via WebSocket. This allows you to receive large datasets in batches or row-by-row, without waiting for the entire result to be loaded into memory.
+
+### Quick Start
+
+```python
+import asyncio
+from hugr.stream import connect_stream
+
+async def main():
+    client = connect_stream("http://localhost:15001/ipc")
+
+    # HTTP query for total count
+    result = client.query("query { devices_aggregation { _rows_count } }")
+    print("Total devices:", result.record()['_rows_count'])
+
+    # Stream data in batches (Arrow RecordBatch)
+    async with await client.stream(
+        """
+        query {
+            devices {
+                id
+                name
+                geom
+            }
+        }
+        """
+    ) as stream:
+        async for batch in stream.chunks():
+            df = batch.to_pandas()
+            print("Batch:", len(df), "rows")
+
+    # Stream data row by row
+    async with await client.stream(
+        "query { devices { id name status } }"
+    ) as stream:
+        async for row in stream.rows():
+            print(row)
+
+asyncio.run(main())
+```
+
+### Main Features
+
+- **connect_stream** — create a streaming client (WebSocket).
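+- **client.stream(query, variables=None)** — open an asynchronous stream of Arrow `RecordBatch` objects for a GraphQL query; await the call and use the result as an async context manager (`async with await client.stream(...) as stream:`).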
+- **stream.chunks()** — async generator for batches (RecordBatch).
+- **stream.rows()** — async generator for rows (dict).
+- **stream.to_pandas()** — collect all streamed data into a single `pandas.DataFrame`; must be awaited (`df = await stream.to_pandas()`).
+- **stream.count()** — count the number of rows in the stream.
+- **stream_data_object(data_object, fields, variables=None)** — stream a specific data object and fields.
+
+### Example: Collect DataFrame via Streaming
+
+```python
+import asyncio
+from hugr.stream import connect_stream
+
+async def main():
+    client = connect_stream("http://localhost:15001/ipc")
+    async with await client.stream(
+        "query { devices { id name geom } }"
+    ) as stream:
+        df = await stream.to_pandas()
+        print(df.head())
+
+asyncio.run(main())
+```
+
+### Example: Row-by-row Processing
+
+```python
+import asyncio
+from hugr.stream import connect_stream
+
+async def main():
+    client = connect_stream()
+    async with await client.stream(
+        "query { devices { id name status } }"
+    ) as stream:
+        async for row in stream.rows():
+            if row.get("status") == "active":
+                print("Active device:", row["name"])
+
+asyncio.run(main())
+```
+
+### Example: Query Cancellation
+
+```python
+import asyncio
+from hugr.stream import connect_stream
+
+async def main():
+    client = connect_stream()
+    async with await client.stream(
+        "query { devices { id name } }"
+    ) as stream:
+        count = 0
+        async for batch in stream.chunks():
+            count += batch.num_rows
+            if count > 1000:
+                await client.cancel_current_query()
+                break
+
+asyncio.run(main())
+```
+
+### Notes
+
+- All streaming functions are asynchronous and require `async`/`await`.
+- Dependencies: `websockets`, `pyarrow`, `pandas`.
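+- Two kinds of client are available: a pure streaming client (WebSocket only) and an enhanced client that supports both regular HTTP queries and WebSocket streaming — the Quick Start above uses `client.query(...)` and `client.stream(...)` on the same client.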
+
+See more in [hugr/stream.py](hugr/stream.py) and the code examples in the source files.
+
 ## License
 
 This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
diff --git a/hugr/__init__.py b/hugr/__init__.py
@@ -8,6 +8,14 @@
     explore_map,
 )
 
+from .stream import (
+    HugrStreamConnection,
+    HugrStreamingClient,
+    HugrStream,
+    connect_stream,
+    new_stream_connection,
+)
+
 __all__ = [
     "HugrClient",
     "HugrIPCResponse",
@@ -16,6 +24,11 @@
     "connect",
     "query",
     "explore_map",
+    "HugrStreamConnection",
+    "HugrStreamingClient",
+    "HugrStream",
+    "connect_stream",
+    "new_stream_connection",
 ]
 
-__version__ = "0.1.0"
+__version__ = "0.1.1"
diff --git a/hugr/client.py b/hugr/client.py
@@ -40,6 +40,9 @@ def __init__(
             for field, fi in geom_fields.items():
                 encoding = fi.get("format", "wkb").lower()
                 if len(field.split(".")) == 1:
+                    if encoding == "h3cell":
+                        # H3 cells are stored as strings, no decoding needed
+                        continue
                     self._df[field] = self._df[field].apply(
                         lambda x: _decode_geom(x, encoding)
                     )
@@ -357,6 +360,8 @@ def _decode_geom(val, fmt):
         return None
     if isinstance(val, BaseGeometry):
         return val
+    if fmt == "h3cell":
+        return val
     if fmt == "wkb":
         return wkb.loads(val)
     elif fmt == "geojson":
@@ -372,6 +377,8 @@ def _encode_geojson(val, fmt):
         return None
     if isinstance(val, BaseGeometry):
         return mapping(val)
+    if fmt == "h3cell":
+        return val
     if fmt == "wkb":
         return mapping(wkb.load(val))
     elif fmt == "geojson":
diff --git a/hugr/stream.py b/hugr/stream.py
diff --git a/pyproject.toml b/pyproject.toml