Skip to content

Commit 9ccdee2

Browse files
authored
Implement V2 API with ingestion and retrieval endpoints, including document management and chunk storage (#361)
1 parent a8ed3cf commit 9ccdee2

File tree

13 files changed

+2497
-29
lines changed

13 files changed

+2497
-29
lines changed

core/api.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
from core.routes.logs import router as logs_router # noqa: E402 – import after FastAPI app
5353
from core.routes.models import router as models_router
5454
from core.routes.usage import router as usage_router
55+
from core.routes.v2 import router as v2_router
5556
from core.services.telemetry import TelemetryService
5657
from core.services_init import document_service, ingestion_service
5758
from core.utils.folder_utils import normalize_folder_selector
@@ -326,6 +327,9 @@ def _extract_provider(model_name: str) -> str:
326327
# Register models router
327328
app.include_router(models_router)
328329

330+
# Register v2 router
331+
app.include_router(v2_router)
332+
329333
# Register logs router
330334
app.include_router(logs_router)
331335

core/app_factory.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ async def lifespan(app_instance: FastAPI):
2727
# ------------------------------------------------------------------
2828
# Import services directly from services_init instead of through api_module
2929
# ------------------------------------------------------------------
30-
from core.services_init import database, settings, vector_store
30+
from core.services_init import database, settings, v2_chunk_store, vector_store
3131

3232
# --- BEGIN MOVED STARTUP LOGIC ---
3333
logger.info("Lifespan: Initializing Database…")
@@ -57,6 +57,18 @@ async def lifespan(app_instance: FastAPI):
5757
exc_info=True,
5858
)
5959

60+
logger.info("Lifespan: Initializing V2 Chunk Store…")
61+
try:
62+
if hasattr(v2_chunk_store, "initialize"):
63+
await v2_chunk_store.initialize()
64+
logger.info("Lifespan: V2 Chunk Store initialization successful (or not applicable).")
65+
except Exception as exc: # noqa: BLE001
66+
logger.error(
67+
"Lifespan: CRITICAL - Failed to initialize V2 Chunk Store: %s",
68+
exc,
69+
exc_info=True,
70+
)
71+
6072
# Initialize ColPali vector store if it exists
6173
# Note: max_sim function creation happens in MultiVectorStore.initialize()
6274
logger.info("Lifespan: Initializing ColPali Vector Store…")

core/database/metadata_filters.py

Lines changed: 54 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,16 @@ class InvalidMetadataFilterError(ValueError):
2929
class MetadataFilterBuilder:
3030
"""Translate JSON-style metadata filters into SQL, covering arrays, regex, and substring operators."""
3131

32-
_COLUMN_FIELDS = {
33-
"filename": "filename",
34-
}
32+
def __init__(
33+
self,
34+
*,
35+
metadata_column: str = "doc_metadata",
36+
metadata_types_column: Optional[str] = "metadata_types",
37+
column_fields: Optional[Dict[str, str]] = None,
38+
) -> None:
39+
self.metadata_column = metadata_column
40+
self.metadata_types_column = metadata_types_column
41+
self._column_fields = column_fields or {"filename": "filename"}
3542

3643
def build(self, filters: Optional[Dict[str, Any]]) -> str:
3744
"""Construct a SQL WHERE clause from a metadata filter dictionary."""
@@ -113,7 +120,7 @@ def _combine_clauses(self, clauses: List[str], operator: str, context: str) -> s
113120

114121
def _build_field_metadata_clause(self, field: str, value: Any) -> str:
115122
"""Build SQL clause for a single metadata field."""
116-
if field in self._COLUMN_FIELDS:
123+
if field in self._column_fields:
117124
return self._build_column_field_clause(field, value)
118125

119126
if isinstance(value, dict) and not any(key.startswith("$") for key in value):
@@ -192,7 +199,7 @@ def _build_single_value_clause(self, field: str, value: Any) -> str:
192199

193200
def _build_column_field_clause(self, field: str, value: Any) -> str:
194201
"""Build SQL clause for a reserved column field (e.g., filename)."""
195-
column = self._COLUMN_FIELDS[field]
202+
column = self._column_fields[field]
196203
builder = TextColumnFilterBuilder(column)
197204

198205
if isinstance(value, dict):
@@ -220,7 +227,7 @@ def _build_exists_clause(self, field: str, operand: Any) -> str:
220227
raise InvalidMetadataFilterError(f"$exists operator for field '{field}' expects a boolean value.")
221228

222229
field_key = self._escape_single_quotes(field)
223-
clause = f"(doc_metadata ? '{field_key}')"
230+
clause = f"({self.metadata_column} ? '{field_key}')"
224231
return clause if expected else f"(NOT {clause})"
225232

226233
def _build_comparison_clause(self, field: str, operator: str, operand: Any) -> str:
@@ -270,9 +277,9 @@ def _build_numeric_comparison_clause(self, field: str, sql_operator: str, operan
270277

271278
field_key = self._escape_single_quotes(field)
272279
type_expr = self._metadata_type_expr(field_key)
273-
# Use CASE to ensure casting only happens when type is correct
274280
value_expr = (
275-
f"(CASE WHEN {type_expr} = 'number' THEN (doc_metadata ->> '{field_key}')::double precision ELSE NULL END)"
281+
f"(CASE WHEN {type_expr} = 'number' THEN ({self.metadata_column} ->> '{field_key}')::double precision "
282+
"ELSE NULL END)"
276283
)
277284
return f"({value_expr} {sql_operator} {literal})"
278285

@@ -285,8 +292,10 @@ def _build_decimal_comparison_clause(self, field: str, sql_operator: str, operan
285292

286293
field_key = self._escape_single_quotes(field)
287294
type_expr = self._metadata_type_expr(field_key)
288-
# Use CASE to ensure casting only happens when type is correct
289-
value_expr = f"(CASE WHEN {type_expr} = 'decimal' THEN (doc_metadata ->> '{field_key}')::numeric ELSE NULL END)"
295+
value_expr = (
296+
f"(CASE WHEN {type_expr} = 'decimal' THEN ({self.metadata_column} ->> '{field_key}')::numeric "
297+
"ELSE NULL END)"
298+
)
290299
return f"({value_expr} {sql_operator} {literal}::numeric)"
291300

292301
def _build_datetime_comparison_clause(self, field: str, sql_operator: str, operand: Any) -> str:
@@ -298,9 +307,9 @@ def _build_datetime_comparison_clause(self, field: str, sql_operator: str, opera
298307

299308
field_key = self._escape_single_quotes(field)
300309
type_expr = self._metadata_type_expr(field_key)
301-
# Use CASE to ensure casting only happens when type is correct
302310
value_expr = (
303-
f"(CASE WHEN {type_expr} = 'datetime' THEN (doc_metadata ->> '{field_key}')::timestamptz ELSE NULL END)"
311+
f"(CASE WHEN {type_expr} = 'datetime' THEN ({self.metadata_column} ->> '{field_key}')::timestamptz "
312+
"ELSE NULL END)"
304313
)
305314
return f"({value_expr} {sql_operator} {literal})"
306315

@@ -313,16 +322,17 @@ def _build_date_comparison_clause(self, field: str, sql_operator: str, operand:
313322

314323
field_key = self._escape_single_quotes(field)
315324
type_expr = self._metadata_type_expr(field_key)
316-
# Use CASE to ensure casting only happens when type is correct
317-
value_expr = f"(CASE WHEN {type_expr} = 'date' THEN (doc_metadata ->> '{field_key}')::date ELSE NULL END)"
325+
value_expr = (
326+
f"(CASE WHEN {type_expr} = 'date' THEN ({self.metadata_column} ->> '{field_key}')::date " "ELSE NULL END)"
327+
)
318328
return f"({value_expr} {sql_operator} {literal})"
319329

320330
def _build_string_comparison_clause(self, field: str, sql_operator: str, operand: str) -> str:
321331
"""Build comparison clause for 'string' typed metadata (only for $eq/$ne)."""
322332
field_key = self._escape_single_quotes(field)
323333
escaped_value = self._escape_single_quotes(operand)
324334
type_expr = self._metadata_type_expr(field_key)
325-
value_expr = f"(doc_metadata ->> '{field_key}')"
335+
value_expr = f"({self.metadata_column} ->> '{field_key}')"
326336
# For strings without explicit type, assume string type (COALESCE handles missing metadata_types)
327337
return f"((COALESCE({type_expr}, 'string') = 'string') AND {value_expr} {sql_operator} '{escaped_value}')"
328338

@@ -345,8 +355,23 @@ def _build_type_clause(self, field: str, operand: Any) -> str:
345355
raise InvalidMetadataFilterError(str(exc)) from exc
346356

347357
field_key = self._escape_single_quotes(field)
348-
type_expr = f"COALESCE(metadata_types ->> '{field_key}', 'string')"
349-
clauses = [f"({type_expr} = '{type_name}')" for type_name in canonical_types]
358+
if not self.metadata_types_column:
359+
jsonb_type_expr = f"jsonb_typeof({self.metadata_column} -> '{field_key}')"
360+
type_map = {
361+
"string": "string",
362+
"number": "number",
363+
"decimal": "number",
364+
"boolean": "boolean",
365+
"object": "object",
366+
"array": "array",
367+
"null": "null",
368+
"datetime": "string",
369+
"date": "string",
370+
}
371+
clauses = [f"({jsonb_type_expr} = '{type_map.get(type_name, type_name)}')" for type_name in canonical_types]
372+
else:
373+
type_expr = f"COALESCE({self.metadata_types_column} ->> '{field_key}', 'string')"
374+
clauses = [f"({type_expr} = '{type_name}')" for type_name in canonical_types]
350375
if len(clauses) == 1:
351376
return clauses[0]
352377
return "(" + " OR ".join(clauses) + ")"
@@ -367,7 +392,7 @@ def _jsonb_contains_clause(self, field: str, value: Any) -> str:
367392
) from exc
368393

369394
escaped_payload = json_payload.replace("'", "''")
370-
base_clause = f"(doc_metadata @> '{escaped_payload}'::jsonb)"
395+
base_clause = f"({self.metadata_column} @> '{escaped_payload}'::jsonb)"
371396

372397
array_clause = self._build_array_membership_clause(field, value)
373398
if array_clause:
@@ -391,8 +416,8 @@ def _build_array_membership_clause(self, field: str, value: Any) -> str:
391416
field_key = self._escape_single_quotes(field)
392417

393418
return (
394-
f"((jsonb_typeof(doc_metadata -> '{field_key}') = 'array') "
395-
f"AND ((doc_metadata -> '{field_key}') @> '{escaped_array_payload}'::jsonb))"
419+
f"((jsonb_typeof({self.metadata_column} -> '{field_key}') = 'array') "
420+
f"AND (({self.metadata_column} -> '{field_key}') @> '{escaped_array_payload}'::jsonb))"
396421
)
397422

398423
def _build_regex_clause(self, field: str, operand: Any) -> str:
@@ -403,7 +428,7 @@ def _build_regex_clause(self, field: str, operand: Any) -> str:
403428
escaped_pattern = pattern.replace("\\", "\\\\").replace("'", "''")
404429
field_key = self._escape_single_quotes(field)
405430

406-
base_clause = f"((doc_metadata ->> '{field_key}') {regex_operator} '{escaped_pattern}')"
431+
base_clause = f"(({self.metadata_column} ->> '{field_key}') {regex_operator} '{escaped_pattern}')"
407432
array_clause = self._build_array_regex_clause(field, regex_operator, escaped_pattern)
408433
if array_clause:
409434
return f"({base_clause} OR {array_clause})"
@@ -440,8 +465,8 @@ def _build_array_regex_clause(self, field: str, regex_operator: str, escaped_pat
440465
field_key = self._escape_single_quotes(field)
441466
array_value_expr = "trim('\"' FROM arr.value::text)"
442467
return (
443-
f"((jsonb_typeof(doc_metadata -> '{field_key}') = 'array') AND EXISTS ("
444-
f"SELECT 1 FROM jsonb_array_elements(doc_metadata -> '{field_key}') AS arr(value) "
468+
f"((jsonb_typeof({self.metadata_column} -> '{field_key}') = 'array') AND EXISTS ("
469+
f"SELECT 1 FROM jsonb_array_elements({self.metadata_column} -> '{field_key}') AS arr(value) "
445470
f"WHERE jsonb_typeof(arr.value) = 'string' AND {array_value_expr} {regex_operator} '{escaped_pattern}'))"
446471
)
447472

@@ -453,7 +478,7 @@ def _build_contains_clause(self, field: str, operand: Any) -> str:
453478
escaped_pattern = self._escape_like_pattern(value)
454479
field_key = self._escape_single_quotes(field)
455480

456-
base_clause = f"((doc_metadata ->> '{field_key}') {like_operator} '%{escaped_pattern}%')"
481+
base_clause = f"(({self.metadata_column} ->> '{field_key}') {like_operator} '%{escaped_pattern}%')"
457482
array_clause = self._build_array_like_clause(field, like_operator, escaped_pattern)
458483
if array_clause:
459484
return f"({base_clause} OR {array_clause})"
@@ -492,15 +517,17 @@ def _build_array_like_clause(self, field: str, like_operator: str, escaped_patte
492517
field_key = self._escape_single_quotes(field)
493518
array_value_expr = "trim('\"' FROM arr.value::text)"
494519
return (
495-
f"((jsonb_typeof(doc_metadata -> '{field_key}') = 'array') AND EXISTS ("
496-
f"SELECT 1 FROM jsonb_array_elements(doc_metadata -> '{field_key}') AS arr(value) "
520+
f"((jsonb_typeof({self.metadata_column} -> '{field_key}') = 'array') AND EXISTS ("
521+
f"SELECT 1 FROM jsonb_array_elements({self.metadata_column} -> '{field_key}') AS arr(value) "
497522
f"WHERE jsonb_typeof(arr.value) = 'string' AND "
498523
f"{array_value_expr} {like_operator} '%{escaped_pattern}%'))"
499524
)
500525

501526
def _metadata_type_expr(self, field_key: str) -> str:
502527
"""Return SQL expression fetching the stored metadata type for a field."""
503-
return f"(metadata_types ->> '{field_key}')"
528+
if not self.metadata_types_column:
529+
return "NULL"
530+
return f"({self.metadata_types_column} ->> '{field_key}')"
504531

505532
def _map_comparison_operator(self, operator: str) -> str:
506533
"""Map comparison operators to SQL symbols."""

core/models/v2.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from typing import Any, Dict, List, Optional
2+
3+
from pydantic import BaseModel, Field, model_validator
4+
5+
6+
class V2IngestResponse(BaseModel):
7+
document_id: str
8+
filename: str
9+
chunk_count: int
10+
status: Optional[str] = None
11+
12+
13+
class V2RetrieveFilters(BaseModel):
14+
document_ids: Optional[List[str]] = Field(default=None, description="Limit to specific document IDs")
15+
folder_paths: Optional[List[str]] = Field(default=None, description="Limit to specific folder paths")
16+
metadata: Optional[Dict[str, Any]] = Field(default=None, description="Chunk-level metadata filters")
17+
18+
19+
class V2RetrieveRequest(BaseModel):
20+
query: str
21+
filters: Optional[V2RetrieveFilters] = None
22+
top_k: int = Field(default=5, ge=1, le=100)
23+
end_user_id: Optional[str] = Field(default=None, description="Optional end-user scope")
24+
25+
@model_validator(mode="after")
26+
def validate_query(self):
27+
if not self.query or not self.query.strip():
28+
raise ValueError("query must be a non-empty string")
29+
return self
30+
31+
32+
class V2ChunkResult(BaseModel):
33+
chunk_id: str
34+
document_id: str
35+
page_number: Optional[int] = None
36+
chunk_number: Optional[int] = None
37+
score: float
38+
content: str
39+
40+
41+
class V2RetrieveResponse(BaseModel):
42+
query: str
43+
chunks: List[V2ChunkResult]

0 commit comments

Comments
 (0)