
Commit 21ec3c6

Add docker compose and llama-cpp-server (#21)

* Add GPU Dockerfile and all-in-one llama-cpp docker compose w/ model downloader (fixed)
* Sets `.env` file in docker compose + uses env values for llama-cpp-server

1 parent 0be7a9d commit 21ec3c6

File tree

6 files changed: +165 −10 lines


.dockerignore

Lines changed: 3 additions & 0 deletions
```diff
@@ -0,0 +1,3 @@
+*.env
+models/
+*.gguf
```

.gitignore

Lines changed: 6 additions & 0 deletions
```diff
@@ -0,0 +1,6 @@
+.env
+__pycache__/
+.venv/
+venv/
+models/
+*.gguf
```

Dockerfile.gpu

Lines changed: 47 additions & 0 deletions
```diff
@@ -0,0 +1,47 @@
+FROM ubuntu:22.04
+
+# Set non-interactive frontend
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install Python and other dependencies
+RUN apt-get update && apt-get install -y \
+    python3.10 \
+    python3-pip \
+    python3-venv \
+    libsndfile1 \
+    ffmpeg \
+    portaudio19-dev \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Create non-root user and set up directories
+RUN useradd -m -u 1001 appuser && \
+    mkdir -p /app/outputs /app && \
+    chown -R appuser:appuser /app
+
+USER appuser
+WORKDIR /app
+
+# Copy dependency files
+COPY --chown=appuser:appuser requirements.txt ./requirements.txt
+
+# Create and activate virtual environment
+RUN python3 -m venv /app/venv
+ENV PATH="/app/venv/bin:$PATH"
+
+# Install PyTorch with CUDA support and other dependencies
+RUN pip3 install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 && \
+    pip3 install --no-cache-dir -r requirements.txt
+
+# Copy project files
+COPY --chown=appuser:appuser . .
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONPATH=/app \
+    USE_GPU=true
+
+# Expose the port
+EXPOSE 5005
+
+# Run FastAPI server with uvicorn
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "5005", "--workers", "1"]
```

README.md

Lines changed: 20 additions & 5 deletions
````diff
@@ -57,6 +57,8 @@ Listen to sample outputs with different voices and emotions:
 ```
 Orpheus-FastAPI/
 ├── app.py                # FastAPI server and endpoints
+├── docker-compose.yml    # Docker compose configuration
+├── Dockerfile.gpu        # GPU-enabled Docker image
 ├── requirements.txt      # Dependencies
 ├── static/               # Static assets (favicon, etc.)
 ├── outputs/              # Generated audio files
@@ -74,9 +76,21 @@ Orpheus-FastAPI/
 
 - Python 3.8-3.11 (Python 3.12 is not supported due to removal of pkgutil.ImpImporter)
 - CUDA-compatible GPU (recommended: RTX series for best performance)
-- Separate LLM inference server running the Orpheus model (e.g., LM Studio or llama.cpp server)
+- Docker compose, or a separate LLM inference server running the Orpheus model (e.g., LM Studio or llama.cpp server)
 
-### Installation
+### 🐳 Docker compose
+
+The docker compose file orchestrates the Orpheus-FastAPI service for audio generation and a llama.cpp inference server for base model token generation. The GGUF model is downloaded by the model-init service.
+
+```bash
+cp .env.example .env  # Nothing needs to be changed, but the file is required
+```
+
+```bash
+docker compose up --build
+```
+
+### FastAPI Service Native Installation
 
 1. Clone the repository:
 ```bash
@@ -271,7 +285,7 @@ You can easily integrate this TTS solution with [OpenWebUI](https://github.com/o
 
 ### External Inference Server
 
-This application requires a separate LLM inference server running the Orpheus model. You can use:
+This application requires a separate LLM inference server running the Orpheus model. For easy setup, use Docker Compose, which handles this automatically. Alternatively, you can use:
 
 - [GPUStack](https://github.com/gpustack/gpustack) - GPU optimised LLM inference server (My pick) - supports LAN/WAN tensor split parallelisation
 - [LM Studio](https://lmstudio.ai/) - Load the GGUF model and start the local server
@@ -291,16 +305,17 @@ The inference server should be configured to expose an API endpoint that this Fa
 
 ### Environment Variables
 
-You can configure the system using environment variables or a `.env` file:
+Configure these in docker compose if using Docker; otherwise, create a `.env` file:
 
-- `ORPHEUS_API_URL`: URL of the LLM inference API (tts_engine/inference.py)
+- `ORPHEUS_API_URL`: URL of the LLM inference API (default in Docker: http://llama-cpp-server:5006/v1/completions)
 - `ORPHEUS_API_TIMEOUT`: Timeout in seconds for API requests (default: 120)
 - `ORPHEUS_MAX_TOKENS`: Maximum tokens to generate (default: 8192)
 - `ORPHEUS_TEMPERATURE`: Temperature for generation (default: 0.6)
 - `ORPHEUS_TOP_P`: Top-p sampling parameter (default: 0.9)
 - `ORPHEUS_SAMPLE_RATE`: Audio sample rate in Hz (default: 24000)
 - `ORPHEUS_PORT`: Web server port (default: 5005)
 - `ORPHEUS_HOST`: Web server host (default: 0.0.0.0)
+- `ORPHEUS_MODEL_NAME`: Model name for inference server
 
 The system now supports loading environment variables from a `.env` file in the project root, making it easier to configure without modifying system-wide environment settings. See `.env.example` for a template.
 
````
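For the non-Docker path, a minimal `.env` sketch using the variables documented above; the values shown are the listed defaults, the localhost API URL and the GGUF filename are assumptions, and `.env.example` in the repository remains the authoritative template:

```bash
# Illustrative .env (not the authoritative template - see .env.example)
ORPHEUS_API_URL=http://127.0.0.1:5006/v1/completions
ORPHEUS_API_TIMEOUT=120
ORPHEUS_MAX_TOKENS=8192
ORPHEUS_TEMPERATURE=0.6
ORPHEUS_TOP_P=0.9
ORPHEUS_SAMPLE_RATE=24000
ORPHEUS_PORT=5005
ORPHEUS_HOST=0.0.0.0
# Assumed GGUF filename - set this to the model you actually serve
ORPHEUS_MODEL_NAME=Orpheus-3b-FT-Q4_K_M.gguf
```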

app.py

Lines changed: 21 additions & 5 deletions
```diff
@@ -11,14 +11,30 @@
 
 # Function to ensure .env file exists
 def ensure_env_file_exists():
-    """Create a default .env file if one doesn't exist"""
+    """Create a .env file from defaults and OS environment variables"""
     if not os.path.exists(".env") and os.path.exists(".env.example"):
         try:
-            # Copy .env.example to .env
+            # 1. Create default env dictionary from .env.example
+            default_env = {}
             with open(".env.example", "r") as example_file:
-                with open(".env", "w") as env_file:
-                    env_file.write(example_file.read())
-            print("✅ Created default configuration file at .env")
+                for line in example_file:
+                    line = line.strip()
+                    if line and not line.startswith("#") and "=" in line:
+                        key = line.split("=")[0].strip()
+                        default_env[key] = line.split("=", 1)[1].strip()
+
+            # 2. Override defaults with Docker environment variables if they exist
+            final_env = default_env.copy()
+            for key in default_env:
+                if key in os.environ:
+                    final_env[key] = os.environ[key]
+
+            # 3. Write dictionary to .env file in env format
+            with open(".env", "w") as env_file:
+                for key, value in final_env.items():
+                    env_file.write(f"{key}={value}\n")
+
+            print("✅ Created default .env file from .env.example and environment variables.")
         except Exception as e:
             print(f"⚠️ Error creating default .env file: {e}")
 
```
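A short sketch of the resulting precedence, assuming the project's dependencies are installed and `ensure_env_file_exists()` is invoked directly (the variable name and the 0.6 default are taken from the README's environment variable list):

```bash
# Start from a clean state: no .env yet, .env.example provides the defaults
rm -f .env

# A value already present in the process environment (e.g. set by docker compose)
# overrides the corresponding .env.example default when the .env file is generated
export ORPHEUS_TEMPERATURE=0.4
python3 -c "from app import ensure_env_file_exists; ensure_env_file_exists()"

grep ORPHEUS_TEMPERATURE .env
# -> ORPHEUS_TEMPERATURE=0.4  (keys not set in the environment keep their .env.example values)
```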

docker-compose.yml

Lines changed: 68 additions & 0 deletions
```diff
@@ -0,0 +1,68 @@
+services:
+  orpheus-fastapi:
+    container_name: orpheus-fastapi
+    build:
+      context: .
+      dockerfile: Dockerfile.gpu
+    ports:
+      - "5005:5005"
+    env_file:
+      - .env
+    environment:
+      - ORPHEUS_API_URL=http://llama-cpp-server:5006/v1/completions
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    restart: unless-stopped
+    depends_on:
+      llama-cpp-server:
+        condition: service_started
+
+  llama-cpp-server:
+    image: ghcr.io/ggml-org/llama.cpp:server-cuda
+    ports:
+      - "5006:5006"
+    volumes:
+      - ./models:/models
+    env_file:
+      - .env
+    depends_on:
+      model-init:
+        condition: service_completed_successfully
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    restart: unless-stopped
+    command: >
+      -m /models/${ORPHEUS_MODEL_NAME}
+      --port 5006
+      --host 0.0.0.0
+      --n-gpu-layers 29
+      --ctx-size ${ORPHEUS_MAX_TOKENS}
+      --n-predict ${ORPHEUS_MAX_TOKENS}
+      --rope-scaling linear
+
+  model-init:
+    image: curlimages/curl:latest
+    user: ${UID}:${GID}
+    volumes:
+      - ./models:/app/models
+    working_dir: /app
+    command: >
+      sh -c '
+      if [ ! -f /app/models/${ORPHEUS_MODEL_NAME} ]; then
+        echo "Downloading model file..."
+        wget -P /app/models https://huggingface.co/lex-au/${ORPHEUS_MODEL_NAME}/resolve/main/${ORPHEUS_MODEL_NAME}
+      else
+        echo "Model file already exists"
+      fi'
+    restart: "no"
```
