code-kern-ai
diff --git a/‎.cursor/rules/api-models.mdc‎
Lines changed: 85 additions & 0 deletions b/‎.cursor/rules/api-models.mdc‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎.cursor/rules/controllers.mdc‎
Lines changed: 155 additions & 0 deletions b/‎.cursor/rules/controllers.mdc‎
Lines changed: 155 additions & 0 deletions
diff --git a/‎.cursor/rules/exceptions.mdc‎
Lines changed: 84 additions & 0 deletions b/‎.cursor/rules/exceptions.mdc‎
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,85 @@
+---
+description: Rules for Pydantic models and request/response validation
+globs: ["src/data/data_type.py"]
+alwaysApply: true
+---
+
+# API Models Guidelines
+
+Pydantic models validate request bodies and ensure type safety. Models are defined in `src/data/data_type.py`.
+
+## Model Definition
+
+**Basic structure:**
+```python
+from typing import Dict, List, Any
+from pydantic import BaseModel
+
+class EmbeddingRequest(BaseModel):
+    project_id: str
+    embedding_id: str
+
+class EmbeddingRebuildRequest(BaseModel):
+    # example request structure:
+    # {"<embedding_id>":[{"record_id":"<record_id>","attribute_name":"<attribute_name>","sub_key":<sub_key>}]}
+    # note that sub_key is optional and only relevant for embedding lists
+    # also, sub_key is an int but is converted to a string in the request
+    changes: Dict[str, List[Dict[str, Any]]]
+
+class EmbeddingCalcTensorByPkl(BaseModel):
+    texts: List[str]
+```
+
+## Naming Conventions
+
+- Request models: `EmbeddingRequest`, `EmbeddingRebuildRequest`, `EmbeddingCalcTensorByPkl`
+- Use descriptive names that indicate the operation and data type
+- Use the `Request` suffix for request body models (`EmbeddingCalcTensorByPkl` is an existing exception to this rule)
+
+## Usage in Routes
+
+```python
+from src.data import data_type
+
+@app.post("/embed")
+def embed(request: data_type.EmbeddingRequest) -> responses.PlainTextResponse:
+    status_code = controller.manage_encoding_thread(
+        request.project_id, request.embedding_id
+    )
+    return responses.PlainTextResponse(status_code=status_code)
+
+@app.post("/re_embed_records/{project_id}")
+def re_embed_record(
+    project_id: str, 
+    request: data_type.EmbeddingRebuildRequest
+) -> responses.PlainTextResponse:
+    controller.re_embed_records(project_id, request.changes)
+    return responses.PlainTextResponse(status_code=status.HTTP_200_OK)
+```
+
+## Field Validation
+
+```python
+from pydantic import field_validator, Field
+
+class EmbeddingRequest(BaseModel):
+    project_id: str = Field(min_length=1)
+    embedding_id: str = Field(min_length=1)
+    
+    @field_validator('project_id', 'embedding_id')
+    @classmethod
+    def validate_ids(cls, v):
+        if not v or not v.strip():
+            raise ValueError('ID cannot be empty')
+        return v.strip()
+```
+
+## Best Practices
+
+1. Use standard Python type hints (`str`, `int`, `List`, `Dict`) - Pydantic validates incoming values against them
+2. Provide defaults for optional fields using `Optional[Type] = None`
+3. Use descriptive model names that indicate purpose
+4. Document complex nested structures with comments
+5. Use proper type hints for all fields
+6. Keep models focused on request/response data structure
+7. Use `Dict[str, Any]` for flexible nested structures when needed
@@ -0,0 +1,155 @@
+---
+description: Rules for controller module and business logic
+globs: ["controller.py"]
+alwaysApply: true
+---
+
+# Controllers Guidelines
+
+The controller module (`controller.py`) contains business logic for embedding operations and orchestrates interactions between routes, submodules, embedders, and external services.
+
+## Import Patterns
+
+```python
+# Submodules
+from submodules.model.business_objects import (
+    attribute,
+    embedding,
+    general,
+    project,
+    record,
+    tokenization,
+    notification,
+    organization,
+)
+from submodules.model import enums, daemon
+from submodules.s3 import controller as s3
+
+# Embedders
+from src.embedders import Transformer, util
+from src.embedders.classification.contextual import (
+    OpenAISentenceEmbedder,
+    HuggingFaceSentenceEmbedder,
+)
+from src.util import request_util
+from src.util.decorator import param_throttle
+from src.util.embedders import get_embedder
+from src.util.notification import send_project_update
+```
+
+## Function Patterns
+
+**Background embedding operations:**
+```python
+from submodules.model import daemon
+from fastapi import status
+
+def manage_encoding_thread(project_id: str, embedding_id: str) -> int:
+    daemon.run_without_db_token(prepare_run, project_id, embedding_id)
+    return status.HTTP_200_OK
+```
+
+**Embedding lifecycle:**
+```python
+def delete_embedding(project_id: str, embedding_id: str) -> int:
+    object_name = f"embedding_tensors_{embedding_id}.csv.bz2"
+    org_id = organization.get_id_by_project_id(project_id)
+    s3.delete_object(org_id, f"{project_id}/{object_name}")
+    request_util.delete_embedding_from_neural_search(embedding_id)
+    json_path = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json"
+    json_path.unlink(missing_ok=True)
+    return status.HTTP_200_OK
+```
+
+**Embedding state management:**
+```python
+def run_encoding(project_id: str, user_id: str, embedding_id: str, ...) -> int:
+    session_token = general.get_ctx_token()
+    try:
+        # Update embedding state
+        embedding.update_embedding_state_encoding(project_id, embedding_id, with_commit=True)
+        send_project_update(project_id, f"embedding:{embedding_id}:state:{enums.EmbeddingState.ENCODING.value}")
+        
+        # Process batches
+        for pair in generate_batches(...):
+            embedding.create_tensors(project_id, embedding_id, record_ids_batched, tensors, with_commit=True)
+            send_progress_update_throttle(project_id, embedding_id, state, initial_count)
+        
+        # Finalize
+        embedding.update_embedding_state_finished(project_id, embedding_id, with_commit=True)
+    finally:
+        general.remove_and_refresh_session(session_token)
+    return status.HTTP_200_OK
+```
+
+## Business Logic Patterns
+
+**Batch processing:**
+```python
+def generate_batches(
+    project_id: str,
+    record_ids: List[str],
+    embedding_type: str,
+    attribute_values_raw: List[str],
+    embedder: Transformer,
+    attribute_name: str,
+    for_delta: bool = False,
+) -> Iterator[Dict[List[str], List[Any]]]:
+    # Process records in batches using embedder.batch_size
+    # Yield batches of record_ids and embeddings
+    pass
+```
+
+**Session management:**
+```python
+def prepare_run(project_id: str, embedding_id: str) -> None:
+    session_token = general.get_ctx_token()
+    try:
+        t = __prepare_encoding(project_id, embedding_id)
+    finally:
+        general.remove_and_refresh_session(session_token)
+    if t:
+        run_encoding(*t)
+```
+
+**Error handling with notifications:**
+```python
+try:
+    # Embedding operation
+    pass
+except Exception as e:
+    embedding.update_embedding_state_failed(project_id, embedding_id, with_commit=True)
+    send_project_update(project_id, f"embedding:{embedding_id}:state:{enums.EmbeddingState.FAILED.value}")
+    notification.create(
+        project_id,
+        user_id,
+        str(e),
+        enums.Notification.ERROR.value,
+        enums.NotificationType.EMBEDDING_CREATION_FAILED.value,
+        True,
+    )
+    return status.HTTP_500_INTERNAL_SERVER_ERROR
+```
+
+**Throttled progress updates:**
+```python
+@param_throttle(seconds=5)
+def send_progress_update_throttle(
+    project_id: str, embedding_id: str, state: str, initial_count: int
+) -> None:
+    progress = resolve_progress(embedding_id, state, initial_count)
+    send_project_update(project_id, f"embedding:{embedding_id}:progress:{progress}")
+```
+
+## Best Practices
+
+1. Single responsibility per function
+2. Always validate inputs and check embedding existence
+3. Use type hints for all parameters
+4. Use `with_commit=True` when modifying database state
+5. Use submodule business objects, never SQLAlchemy directly
+6. Manage database sessions with `general.get_ctx_token()` and `general.remove_and_refresh_session()`
+7. Use `daemon.run_without_db_token()` for background operations
+8. Update embedding state and send project updates for progress tracking
+9. Clean up resources (delete embedders, call `gc.collect()`) after operations
+10. Handle errors gracefully with appropriate notifications and state updates
@@ -0,0 +1,84 @@
+---
+description: Rules for exception handling and custom exceptions
+globs: ["**/*.py"]
+alwaysApply: true
+---
+
+# Exceptions Guidelines
+
+## Exception Locations
+
+**Submodule exceptions:**
+```python
+from submodules.model.exceptions import EntityNotFoundException, EntityAlreadyExistsException
+```
+
+**Standard Python exceptions:**
+- `ValueError` - Invalid input values
+- `Exception` - General errors (with specific messages)
+
+## Usage Patterns
+
+**Raising exceptions:**
+```python
+# Validation
+if not embedding.get(project_id, embedding_id):
+    raise ValueError(f"Embedding {embedding_id} not found in project {project_id}")
+
+# Not found (from submodules)
+embedding_item = embedding.get(project_id, embedding_id)
+if not embedding_item:
+    # Handle gracefully - return early or raise
+    return
+
+# Business logic errors
+if not embedder:
+    raise Exception(
+        f"couldn't find matching embedder for requested embedding with type {embedding_type} model {model} and platform {platform}"
+    )
+```
+
+**Handling in controllers:**
+```python
+try:
+    embedder = get_embedder(...)
+    if not embedder:
+        raise Exception("Could not initialize embedder")
+except Exception as e:
+    print(traceback.format_exc(), flush=True)
+    embedding.update_embedding_state_failed(project_id, embedding_id, with_commit=True)
+    send_project_update(project_id, f"embedding:{embedding_id}:state:{enums.EmbeddingState.FAILED.value}")
+    notification.create(...)
+    return status.HTTP_422_UNPROCESSABLE_ENTITY
+```
+
+**Handling in routes:**
+```python
+@app.post("/calc-tensor-by-pkl/{project_id}/{embedding_id}")
+def calc_tensor(...):
+    if tensor := controller.calc_tensors(project_id, embedding_id, request.texts):
+        return responses.JSONResponse(status_code=status.HTTP_200_OK, content={"tensor": tensor})
+    return responses.PlainTextResponse(
+        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+        content="Error while calculating tensor",
+    )
+```
+
+## HTTP Status Code Mapping
+
+- `200`: Successful operations
+- `422`: `UnprocessableEntity` - Invalid input or model initialization failures
+- `500`: `InternalServerError` - Runtime errors, API connection errors, general exceptions
+
+## Error Handling Best Practices
+
+1. Use specific exception types when available from submodules
+2. Provide clear error messages with context (project_id, embedding_id, etc.)
+3. Log exceptions with `print(traceback.format_exc(), flush=True)` for debugging
+4. Update embedding state to `FAILED` when errors occur
+5. Send project updates to notify users of failures
+6. Create notifications for user-facing errors
+7. Return appropriate HTTP status codes from routes
+8. Clean up resources (sessions, embedders) in `finally` blocks
+9. Don't swallow exceptions silently - always handle or propagate
+10. Use early returns for validation failures to avoid deep nesting