diff --git a/.env.example b/.env.example index 74774b9..3931cd2 100644 --- a/.env.example +++ b/.env.example @@ -6,9 +6,6 @@ # REQUIRED - You must set these before running # ============================================================================= -# Anthropic API key for Claude models (get from console.anthropic.com) -ANTHROPIC_API_KEY=sk-ant-... - # Convex deployment URL (from your Convex dashboard) # Apps reference this as NEXT_PUBLIC_CONVEX_URL CONVEX_URL=https://your-deployment.convex.cloud @@ -21,8 +18,8 @@ SQUADHUB_TOKEN=your-secure-token-here # OPTIONAL # ============================================================================= -# OpenAI API key (for image generation, optional) -# OPENAI_API_KEY=sk-... +# NOTE: API keys (Anthropic, OpenAI) are managed via the UI during onboarding +# and in Settings > General > API Keys. They are stored in the Convex tenant record. # Environment: dev or prod ENVIRONMENT=dev diff --git a/CLAUDE.md b/CLAUDE.md index 1640422..0710d09 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -54,7 +54,6 @@ Core models: `agents`, `tasks`, `messages` (see `packages/backend/convex/schema. ## Environment Variables - `NEXT_PUBLIC_CONVEX_URL`: Convex deployment URL (required) -- `ANTHROPIC_API_KEY`: Anthropic API key for AI operations (required, passed to squadhub) - `SQUADHUB_TOKEN`: Authentication token for squadhub gateway (required) - `SQUADHUB_URL`: SquadHub gateway URL (set in `.env.development` / `.env.production`) - `NODE_ENV`: local (`development`) vs deployed (`production`) — controls dev tooling diff --git a/README.md b/README.md index dfa3185..d7bae14 100644 --- a/README.md +++ b/README.md @@ -44,12 +44,10 @@ Edit `.env`: ```bash # Required -ANTHROPIC_API_KEY=sk-ant-... SQUADHUB_TOKEN=your-secure-token CONVEX_URL=https://your-deployment.convex.cloud -# Optional -OPENAI_API_KEY=sk-... # For image generation +# API keys (Anthropic, OpenAI) are configured via the UI during onboarding ``` ### 3. Deploy Convex Backend @@ -281,9 +279,9 @@ pnpm convex:deploy ## Environment Variables -| Variable | Required | Description | -| ------------------- | -------- | --------------------------------- | -| `ANTHROPIC_API_KEY` | Yes | Anthropic API key for Claude | -| `SQUADHUB_TOKEN` | Yes | Auth token for squadhub gateway | -| `CONVEX_URL` | Yes | Convex deployment URL | -| `OPENAI_API_KEY` | No | OpenAI key (for image generation) | +| Variable | Required | Description | +| ---------------- | -------- | ------------------------------- | +| `SQUADHUB_TOKEN` | Yes | Auth token for squadhub gateway | +| `CONVEX_URL` | Yes | Convex deployment URL | + +API keys (Anthropic, OpenAI) are managed via the UI during onboarding and in Settings > General > API Keys. diff --git a/apps/web/CLAUDE.md b/apps/web/CLAUDE.md index 3620c8c..e6eb5fa 100644 --- a/apps/web/CLAUDE.md +++ b/apps/web/CLAUDE.md @@ -74,7 +74,7 @@ import type { Agent, Tenant } from "@clawe/backend/types"; **Environment variables:** - `NEXT_PUBLIC_CONVEX_URL` → Convex deployment URL (required) -- `ANTHROPIC_API_KEY` → Anthropic API key (passed to squadhub container) +- API keys (Anthropic, OpenAI) → managed via the UI, stored in Convex tenant record - `SQUADHUB_URL` → SquadHub gateway URL - `SQUADHUB_TOKEN` → SquadHub authentication token (from root `.env`) diff --git a/apps/web/src/app/(dashboard)/_components/nav-settings.tsx b/apps/web/src/app/(dashboard)/_components/nav-settings.tsx index 6051680..e163d79 100644 --- a/apps/web/src/app/(dashboard)/_components/nav-settings.tsx +++ b/apps/web/src/app/(dashboard)/_components/nav-settings.tsx @@ -7,6 +7,7 @@ import { Settings2, AlertTriangle, Globe, + KeyRound, Plug, } from "lucide-react"; import { @@ -25,6 +26,11 @@ const settingsItems = [ url: "/settings/general", icon: Settings2, }, + { + title: "API Keys", + url: "/settings/api-keys", + icon: KeyRound, + }, { title: "Business", url: "/settings/business", diff --git a/apps/web/src/app/(dashboard)/settings/api-keys/_components/api-keys-settings.tsx b/apps/web/src/app/(dashboard)/settings/api-keys/_components/api-keys-settings.tsx new file mode 100644 index 0000000..c34d5af --- /dev/null +++ b/apps/web/src/app/(dashboard)/settings/api-keys/_components/api-keys-settings.tsx @@ -0,0 +1,362 @@ +"use client"; + +import { useState, useEffect } from "react"; +import { useQuery, useMutation as useConvexMutation } from "convex/react"; +import { useMutation } from "@tanstack/react-query"; +import { api } from "@clawe/backend"; +import { Button } from "@clawe/ui/components/button"; +import { Input } from "@clawe/ui/components/input"; +import { Label } from "@clawe/ui/components/label"; +import { Spinner } from "@clawe/ui/components/spinner"; +import { Skeleton } from "@clawe/ui/components/skeleton"; +import { CheckCircle2, Eye, EyeOff, Pencil } from "lucide-react"; +import { loadPlugins, hasPlugin } from "@clawe/plugins"; +import { patchApiKeys } from "@/lib/squadhub/actions"; +import { useApiClient } from "@/hooks/use-api-client"; +import { config } from "@/lib/config"; + +interface KeyRowProps { + label: string; + maskedValue?: string; + required?: boolean; + placeholder: string; + isEditing: boolean; + onEdit: () => void; + onCancel: () => void; + inputValue: string; + onInputChange: (value: string) => void; + isValid: boolean | null; + isValidating: boolean; + validationError?: string; + onValidate: () => void; + onSave: () => void; + isSaving: boolean; + saveSuccess: boolean; + saveError?: string; + isCloud: boolean; +} + +const KeyRow = ({ + label, + maskedValue, + required, + placeholder, + isEditing, + onEdit, + onCancel, + inputValue, + onInputChange, + isValid, + isValidating, + validationError, + onValidate, + onSave, + isSaving, + saveSuccess, + saveError, + isCloud, +}: KeyRowProps) => { + const [showKey, setShowKey] = useState(false); + + if (!isEditing) { + return ( +
+
+
+

{label}

+ {required && !maskedValue && ( + Required + )} +
+ {maskedValue ? ( +

+ {maskedValue} +

+ ) : ( +

+ Not configured +

+ )} +
+ +
+ ); + } + + return ( +
+ +
+
+ onInputChange(e.target.value)} + className="pr-9" + autoFocus + /> + +
+ +
+ {validationError && ( +

{validationError}

+ )} + {isValid === true && ( +

+ Key validated +

+ )} +
+ + + {saveSuccess && ( +

+ {isCloud ? "Saved and applied" : "Saved and applied"} +

+ )} + {saveError &&

{saveError}

} +
+
+ ); +}; + +export const ApiKeysSettings = () => { + const apiKeys = useQuery(api.tenants.getApiKeys, {}); + const setApiKeysMutation = useConvexMutation(api.tenants.setApiKeys); + const apiClient = useApiClient(); + + const [editingAnthropic, setEditingAnthropic] = useState(false); + const [editingOpenai, setEditingOpenai] = useState(false); + const [anthropicKey, setAnthropicKey] = useState(""); + const [openaiKey, setOpenaiKey] = useState(""); + const [anthropicValid, setAnthropicValid] = useState(null); + const [openaiValid, setOpenaiValid] = useState(null); + const [isCloud, setIsCloud] = useState(config.isCloud); + + useEffect(() => { + loadPlugins().then(() => { + setIsCloud(hasPlugin()); + }); + }, []); + + const anthropicValidation = useMutation({ + mutationFn: async (apiKey: string) => { + const { data } = await apiClient.post<{ + valid: boolean; + error?: string; + }>("/api/tenant/validate-key", { provider: "anthropic", apiKey }); + if (!data.valid) throw new Error(data.error || "Invalid API key"); + return data; + }, + onSuccess: () => setAnthropicValid(true), + onError: () => setAnthropicValid(false), + }); + + const openaiValidation = useMutation({ + mutationFn: async (apiKey: string) => { + const { data } = await apiClient.post<{ + valid: boolean; + error?: string; + }>("/api/tenant/validate-key", { provider: "openai", apiKey }); + if (!data.valid) throw new Error(data.error || "Invalid API key"); + return data; + }, + onSuccess: () => setOpenaiValid(true), + onError: () => setOpenaiValid(false), + }); + + const anthropicSave = useMutation({ + mutationFn: async (key: string) => { + await setApiKeysMutation({ anthropicApiKey: key }); + await patchApiKeys(key); + if (isCloud) { + await apiClient.post("/api/tenant/squadhub/restart"); + } + }, + onSuccess: () => { + setEditingAnthropic(false); + setAnthropicKey(""); + setAnthropicValid(null); + anthropicValidation.reset(); + }, + }); + + const openaiSave = useMutation({ + mutationFn: async (key: string) => { + await setApiKeysMutation({ openaiApiKey: key }); + await patchApiKeys(undefined, key); + if (isCloud) { + await apiClient.post("/api/tenant/squadhub/restart"); + } + }, + onSuccess: () => { + setEditingOpenai(false); + setOpenaiKey(""); + setOpenaiValid(null); + openaiValidation.reset(); + }, + }); + + if (apiKeys === undefined) { + return ; + } + + return ( +
+
+

API Keys

+

+ Manage the API keys used by your AI agents. +

+
+ +
+ setEditingAnthropic(true)} + onCancel={() => { + setEditingAnthropic(false); + setAnthropicKey(""); + setAnthropicValid(null); + anthropicValidation.reset(); + }} + inputValue={anthropicKey} + onInputChange={(value) => { + setAnthropicKey(value); + setAnthropicValid(null); + anthropicValidation.reset(); + }} + isValid={anthropicValid} + isValidating={anthropicValidation.isPending} + validationError={ + anthropicValidation.isError + ? anthropicValidation.error.message + : undefined + } + onValidate={() => anthropicValidation.mutate(anthropicKey)} + onSave={() => anthropicSave.mutate(anthropicKey)} + isSaving={anthropicSave.isPending} + saveSuccess={anthropicSave.isSuccess} + saveError={ + anthropicSave.isError ? anthropicSave.error.message : undefined + } + isCloud={isCloud} + /> + + setEditingOpenai(true)} + onCancel={() => { + setEditingOpenai(false); + setOpenaiKey(""); + setOpenaiValid(null); + openaiValidation.reset(); + }} + inputValue={openaiKey} + onInputChange={(value) => { + setOpenaiKey(value); + setOpenaiValid(null); + openaiValidation.reset(); + }} + isValid={openaiValid} + isValidating={openaiValidation.isPending} + validationError={ + openaiValidation.isError + ? openaiValidation.error.message + : undefined + } + onValidate={() => openaiValidation.mutate(openaiKey)} + onSave={() => openaiSave.mutate(openaiKey)} + isSaving={openaiSave.isPending} + saveSuccess={openaiSave.isSuccess} + saveError={openaiSave.isError ? openaiSave.error.message : undefined} + isCloud={isCloud} + /> +
+
+ ); +}; + +const ApiKeysSettingsSkeleton = () => { + return ( +
+
+ + +
+
+ + +
+
+ ); +}; diff --git a/apps/web/src/app/(dashboard)/settings/api-keys/page.tsx b/apps/web/src/app/(dashboard)/settings/api-keys/page.tsx new file mode 100644 index 0000000..922f8ec --- /dev/null +++ b/apps/web/src/app/(dashboard)/settings/api-keys/page.tsx @@ -0,0 +1,26 @@ +"use client"; + +import { + PageHeader, + PageHeaderRow, + PageHeaderTitle, +} from "@dashboard/page-header"; +import { ApiKeysSettings } from "./_components/api-keys-settings"; + +const ApiKeysSettingsPage = () => { + return ( + <> + + + API Keys + + + +
+ +
+ + ); +}; + +export default ApiKeysSettingsPage; diff --git a/apps/web/src/app/api/tenant/provision/route.ts b/apps/web/src/app/api/tenant/provision/route.ts index f05ff3a..431c566 100644 --- a/apps/web/src/app/api/tenant/provision/route.ts +++ b/apps/web/src/app/api/tenant/provision/route.ts @@ -4,6 +4,7 @@ import { ConvexHttpClient } from "convex/browser"; import { api } from "@clawe/backend"; import { loadPlugins, getPlugin } from "@clawe/plugins"; import { setupTenant } from "@/lib/squadhub/setup"; +import { patchApiKeys } from "@/lib/squadhub/actions"; /** * POST /api/tenant/provision @@ -111,15 +112,24 @@ export const POST = async (request: NextRequest) => { ); } - // 5. Run app-level setup (agents, crons, routines) + // 5. Patch API keys into squadhub config const connection = { squadhubUrl: tenant.squadhubUrl, squadhubToken: tenant.squadhubToken, }; + if (tenant.anthropicApiKey) { + await patchApiKeys( + tenant.anthropicApiKey, + tenant.openaiApiKey ?? undefined, + connection, + ); + } + + // 6. Run app-level setup (agents, crons, routines) const result = await setupTenant(connection, convexUrl, authToken); - // 6. Return result + // 7. Return result return NextResponse.json({ ok: result.errors.length === 0, tenantId: tenant._id, diff --git a/apps/web/src/app/api/tenant/validate-key/route.ts b/apps/web/src/app/api/tenant/validate-key/route.ts new file mode 100644 index 0000000..f0b553f --- /dev/null +++ b/apps/web/src/app/api/tenant/validate-key/route.ts @@ -0,0 +1,56 @@ +import { NextResponse } from "next/server"; +import type { NextRequest } from "next/server"; + +/** + * POST /api/tenant/validate-key + * + * Basic format validation for API keys. + * Accepts both standard API keys and subscription tokens. + */ +export const POST = async (request: NextRequest) => { + try { + const body = (await request.json()) as { + provider: string; + apiKey: string; + }; + + const { provider, apiKey } = body; + + if (!provider || !apiKey) { + return NextResponse.json( + { valid: false, error: "Missing provider or apiKey" }, + { status: 400 }, + ); + } + + if (provider === "anthropic") { + const trimmed = apiKey.trim(); + if (trimmed.length < 10) { + return NextResponse.json({ + valid: false, + error: "Key is too short", + }); + } + return NextResponse.json({ valid: true }); + } + + if (provider === "openai") { + const trimmed = apiKey.trim(); + if (!trimmed.startsWith("sk-")) { + return NextResponse.json({ + valid: false, + error: "OpenAI keys start with sk-", + }); + } + return NextResponse.json({ valid: true }); + } + + return NextResponse.json( + { valid: false, error: `Unknown provider: ${provider}` }, + { status: 400 }, + ); + } catch (error) { + const message = error instanceof Error ? error.message : "Unknown error"; + return NextResponse.json({ valid: false, error: message }, { status: 500 }); + } +}; diff --git a/apps/web/src/app/auth/login/page.tsx b/apps/web/src/app/auth/login/page.tsx index ba88fce..e77e4cb 100644 --- a/apps/web/src/app/auth/login/page.tsx +++ b/apps/web/src/app/auth/login/page.tsx @@ -100,6 +100,7 @@ export default function LoginPage() { height={450} className="h-auto max-h-full w-auto max-w-full object-contain" priority + unoptimized /> diff --git a/apps/web/src/app/setup/api-keys/page.tsx b/apps/web/src/app/setup/api-keys/page.tsx new file mode 100644 index 0000000..3acd1f1 --- /dev/null +++ b/apps/web/src/app/setup/api-keys/page.tsx @@ -0,0 +1,273 @@ +"use client"; + +import { useState } from "react"; +import { useRouter } from "next/navigation"; +import { useMutation as useConvexMutation } from "convex/react"; +import { useMutation } from "@tanstack/react-query"; +import { api } from "@clawe/backend"; +import { Button } from "@clawe/ui/components/button"; +import { Input } from "@clawe/ui/components/input"; +import { Label } from "@clawe/ui/components/label"; +import { Progress } from "@clawe/ui/components/progress"; +import { Spinner } from "@clawe/ui/components/spinner"; +import { CheckCircle2 } from "lucide-react"; +import { patchApiKeys } from "@/lib/squadhub/actions"; +import { useApiClient } from "@/hooks/use-api-client"; + +const TOTAL_STEPS = 5; +const CURRENT_STEP = 2; + +export default function ApiKeysPage() { + const router = useRouter(); + const apiClient = useApiClient(); + const setApiKeys = useConvexMutation(api.tenants.setApiKeys); + + const [anthropicKey, setAnthropicKey] = useState(""); + const [openaiKey, setOpenaiKey] = useState(""); + const [anthropicValid, setAnthropicValid] = useState(false); + const [openaiValid, setOpenaiValid] = useState(null); + + // Validate Anthropic key + const anthropicValidation = useMutation({ + mutationFn: async (apiKey: string) => { + const { data } = await apiClient.post<{ valid: boolean; error?: string }>( + "/api/tenant/validate-key", + { provider: "anthropic", apiKey }, + ); + if (!data.valid) { + throw new Error(data.error || "Invalid API key"); + } + return data; + }, + onSuccess: () => { + setAnthropicValid(true); + }, + onError: () => { + setAnthropicValid(false); + }, + }); + + // Validate OpenAI key + const openaiValidation = useMutation({ + mutationFn: async (apiKey: string) => { + const { data } = await apiClient.post<{ valid: boolean; error?: string }>( + "/api/tenant/validate-key", + { provider: "openai", apiKey }, + ); + if (!data.valid) { + throw new Error(data.error || "Invalid API key"); + } + return data; + }, + onSuccess: () => { + setOpenaiValid(true); + }, + onError: () => { + setOpenaiValid(false); + }, + }); + + // Save keys to Convex and patch into squadhub config + const saveMutation = useMutation({ + mutationFn: async () => { + await setApiKeys({ + anthropicApiKey: anthropicKey, + openaiApiKey: openaiKey || undefined, + }); + await patchApiKeys(anthropicKey, openaiKey || undefined); + }, + onSuccess: () => { + router.push("/setup/business"); + }, + }); + + const handleValidateAnthropic = () => { + if (anthropicKey) { + anthropicValidation.mutate(anthropicKey); + } + }; + + const handleValidateOpenai = () => { + if (openaiKey) { + openaiValidation.mutate(openaiKey); + } + }; + + const handleSubmit = (e: React.FormEvent) => { + e.preventDefault(); + if (anthropicValid) { + saveMutation.mutate(); + } + }; + + const isSubmitting = saveMutation.isPending; + + return ( +
+
+ {/* Progress indicator */} +
+ +
+ +

+ API Keys +

+

+ Your AI agents need API keys to connect to language models. Keys are + stored securely and never leave your deployment. +

+ +
+ {/* Anthropic API Key (required) */} +
+ +
+ { + setAnthropicKey(e.target.value); + setAnthropicValid(false); + anthropicValidation.reset(); + }} + className="flex-1" + /> + +
+ {anthropicValidation.isError && ( +

+ {anthropicValidation.error.message} +

+ )} + {anthropicValid && ( +

+ API key is valid +

+ )} +

+ Get your key from{" "} + + console.anthropic.com + +

+
+ + {/* OpenAI API Key (optional) */} +
+ +
+ { + setOpenaiKey(e.target.value); + setOpenaiValid(null); + openaiValidation.reset(); + }} + className="flex-1" + /> + +
+ {openaiValidation.isError && ( +

+ {openaiValidation.error.message} +

+ )} + {openaiValid === true && ( +

+ API key is valid +

+ )} +

+ Enables image generation. You can add this later in Settings. +

+
+
+ + {saveMutation.isError && ( +

+ {saveMutation.error.message} +

+ )} +
+ + {/* CTA */} +
+ +
+
+ ); +} diff --git a/apps/web/src/app/setup/business/page.tsx b/apps/web/src/app/setup/business/page.tsx index 219bc59..00f7a0c 100644 --- a/apps/web/src/app/setup/business/page.tsx +++ b/apps/web/src/app/setup/business/page.tsx @@ -8,8 +8,8 @@ import { Progress } from "@clawe/ui/components/progress"; import { Chat } from "@/components/chat"; import { useAuth } from "@/providers/auth-provider"; -const TOTAL_STEPS = 4; -const CURRENT_STEP = 2; +const TOTAL_STEPS = 5; +const CURRENT_STEP = 3; // Session key for Clawe agent const CLAWE_SESSION_KEY = "agent:main:main"; diff --git a/apps/web/src/app/setup/complete/page.tsx b/apps/web/src/app/setup/complete/page.tsx index 69a0302..f98df27 100644 --- a/apps/web/src/app/setup/complete/page.tsx +++ b/apps/web/src/app/setup/complete/page.tsx @@ -9,8 +9,8 @@ import { Spinner } from "@clawe/ui/components/spinner"; import { CheckCircle2 } from "lucide-react"; import { api } from "@clawe/backend"; -const TOTAL_STEPS = 4; -const CURRENT_STEP = 4; +const TOTAL_STEPS = 5; +const CURRENT_STEP = 5; export default function CompletePage() { const router = useRouter(); diff --git a/apps/web/src/app/setup/layout.tsx b/apps/web/src/app/setup/layout.tsx index 279492c..2d0e9d6 100644 --- a/apps/web/src/app/setup/layout.tsx +++ b/apps/web/src/app/setup/layout.tsx @@ -31,6 +31,7 @@ const RightPanel = () => { height={450} className="h-auto max-h-full w-auto max-w-full object-contain" priority + unoptimized /> )} diff --git a/apps/web/src/app/setup/telegram/page.tsx b/apps/web/src/app/setup/telegram/page.tsx index d03ce19..3909dc9 100644 --- a/apps/web/src/app/setup/telegram/page.tsx +++ b/apps/web/src/app/setup/telegram/page.tsx @@ -32,8 +32,8 @@ import { import { SetupRightPanelContent } from "../_components/setup-right-panel"; import { DemoVideo } from "./_components/demo-video"; -const TOTAL_STEPS = 4; -const CURRENT_STEP = 3; +const TOTAL_STEPS = 5; +const CURRENT_STEP = 4; type Step = "token" | "pairing" | "success"; diff --git a/apps/web/src/app/setup/welcome/page.tsx b/apps/web/src/app/setup/welcome/page.tsx index 005f078..21cc3c3 100644 --- a/apps/web/src/app/setup/welcome/page.tsx +++ b/apps/web/src/app/setup/welcome/page.tsx @@ -3,22 +3,23 @@ import { useRouter } from "next/navigation"; import { Button } from "@clawe/ui/components/button"; import { Progress } from "@clawe/ui/components/progress"; -import { Globe, MessageCircle, AlertTriangle } from "lucide-react"; +import { Globe, Key, MessageCircle, AlertTriangle } from "lucide-react"; import { Tooltip, TooltipContent, TooltipTrigger, } from "@clawe/ui/components/tooltip"; import { useSquadhubStatus } from "@/hooks/use-squadhub-status"; +import { config } from "@/lib/config"; -const TOTAL_STEPS = 4; +const TOTAL_STEPS = 5; const CURRENT_STEP = 1; export default function WelcomePage() { const router = useRouter(); const { status, isLoading } = useSquadhubStatus(); - const isOffline = !isLoading && status === "down"; + const isOffline = !config.isCloud && !isLoading && status === "down"; return (
@@ -45,6 +46,22 @@ export default function WelcomePage() {

You'll need:

    +
  • +
    + +
    + + Your Anthropic API key from{" "} + + console.anthropic.com + + +
  • @@ -99,7 +116,7 @@ export default function WelcomePage() { variant="brand" className="w-full sm:w-auto" disabled={isOffline} - onClick={() => router.push("/setup/business")} + onClick={() => router.push("/setup/api-keys")} > Get Started diff --git a/apps/web/src/lib/config.ts b/apps/web/src/lib/config.ts new file mode 100644 index 0000000..e19c012 --- /dev/null +++ b/apps/web/src/lib/config.ts @@ -0,0 +1,7 @@ +export const config = { + isCloud: process.env.NEXT_PUBLIC_CLAWE_EDITION === "cloud", + authProvider: (process.env.NEXT_PUBLIC_AUTH_PROVIDER ?? "nextauth") as + | "nextauth" + | "cognito", + convexUrl: process.env.NEXT_PUBLIC_CONVEX_URL ?? "", +} as const; diff --git a/apps/web/src/lib/squadhub/actions.ts b/apps/web/src/lib/squadhub/actions.ts index dc85a51..3e10bb0 100644 --- a/apps/web/src/lib/squadhub/actions.ts +++ b/apps/web/src/lib/squadhub/actions.ts @@ -3,12 +3,14 @@ import { checkHealth, getConfig, + patchConfig, saveTelegramBotToken as saveTelegramBotTokenClient, removeTelegramBotToken as removeTelegramBotTokenClient, probeTelegramToken, approvePairingCode as approvePairingCodeClient, parseToolText, } from "@clawe/shared/squadhub"; +import type { SquadhubConnection } from "@clawe/shared/squadhub"; import { getConnection } from "./connection"; export async function checkSquadhubHealth() { @@ -76,3 +78,27 @@ export async function approvePairingCode( export async function removeTelegramBot() { return removeTelegramBotTokenClient(getConnection()); } + +/** + * Patch API keys into squadhub gateway config via patchConfig. + * Sets keys as env vars in the config — the gateway resolves these + * as fallback credentials for each provider. + */ +export async function patchApiKeys( + anthropicApiKey?: string, + openaiApiKey?: string, + connection?: SquadhubConnection, +) { + const env: Record = {}; + + if (anthropicApiKey) { + env.ANTHROPIC_API_KEY = anthropicApiKey; + } + if (openaiApiKey) { + env.OPENAI_API_KEY = openaiApiKey; + } + + if (Object.keys(env).length === 0) return; + + return patchConfig(connection ?? getConnection(), { env }); +} diff --git a/docker-compose.yml b/docker-compose.yml index a44b090..70e3c5f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,6 +8,7 @@ services: environment: - NODE_ENV=production - NEXT_PUBLIC_CONVEX_URL=${CONVEX_URL} + - NEXT_PUBLIC_CLAWE_EDITION=${CLAWE_EDITION:-oss} - SQUADHUB_URL=http://squadhub:18789 - SQUADHUB_TOKEN=${SQUADHUB_TOKEN} - SQUADHUB_STATE_DIR=/squadhub-data/config @@ -25,8 +26,6 @@ services: user: "${UID:-1000}:${GID:-1000}" environment: - SQUADHUB_TOKEN=${SQUADHUB_TOKEN} - - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} - - OPENAI_API_KEY=${OPENAI_API_KEY:-} - CONVEX_URL=${CONVEX_URL} volumes: - squadhub-data:/data diff --git a/packages/backend/convex/schema.ts b/packages/backend/convex/schema.ts index 19672d2..2a3df92 100644 --- a/packages/backend/convex/schema.ts +++ b/packages/backend/convex/schema.ts @@ -43,6 +43,7 @@ export default defineSchema({ squadhubServiceArn: v.optional(v.string()), efsAccessPointId: v.optional(v.string()), anthropicApiKey: v.optional(v.string()), + openaiApiKey: v.optional(v.string()), settings: v.optional( v.object({ timezone: v.optional(v.string()), diff --git a/packages/backend/convex/tenants.ts b/packages/backend/convex/tenants.ts index b737a6d..94d94f8 100644 --- a/packages/backend/convex/tenants.ts +++ b/packages/backend/convex/tenants.ts @@ -130,11 +130,12 @@ export const updateStatus = mutation({ }, }); -// Store Anthropic API key in tenant record -export const setAnthropicKey = mutation({ +// Store API keys in tenant record +export const setApiKeys = mutation({ args: { machineToken: v.optional(v.string()), - apiKey: v.string(), + anthropicApiKey: v.optional(v.string()), + openaiApiKey: v.optional(v.string()), }, handler: async (ctx, args) => { const tenantId = await resolveTenantId(ctx, args); @@ -143,10 +144,49 @@ export const setAnthropicKey = mutation({ throw new Error("Tenant not found"); } - await ctx.db.patch(tenantId, { - anthropicApiKey: args.apiKey, - updatedAt: Date.now(), - }); + const patch: Record = { updatedAt: Date.now() }; + if (args.anthropicApiKey !== undefined) { + patch.anthropicApiKey = args.anthropicApiKey; + } + if (args.openaiApiKey !== undefined) { + patch.openaiApiKey = args.openaiApiKey; + } + + await ctx.db.patch(tenantId, patch); + }, +}); + +// Get masked API keys for display (last 4 chars only) +export const getApiKeys = query({ + args: {}, + handler: async (ctx) => { + const user = await getUser(ctx); + + const membership = await ctx.db + .query("accountMembers") + .withIndex("by_user", (q) => q.eq("userId", user._id)) + .first(); + + if (!membership) + return { anthropicApiKey: undefined, openaiApiKey: undefined }; + + const tenant = await ctx.db + .query("tenants") + .withIndex("by_account", (q) => q.eq("accountId", membership.accountId)) + .first(); + + if (!tenant) return { anthropicApiKey: undefined, openaiApiKey: undefined }; + + const mask = (key: string | undefined) => { + if (!key) return undefined; + const last4 = key.slice(-4); + return `${"*".repeat(Math.max(0, key.length - 4))}${last4}`; + }; + + return { + anthropicApiKey: mask(tenant.anthropicApiKey), + openaiApiKey: mask(tenant.openaiApiKey), + }; }, }); diff --git a/scripts/start.sh b/scripts/start.sh index e7bc598..76b0829 100755 --- a/scripts/start.sh +++ b/scripts/start.sh @@ -32,7 +32,7 @@ if [ ! -f .env ]; then fi echo_info "Generated SQUADHUB_TOKEN: ${TOKEN:0:8}..." - echo_warn "Please edit .env and set your ANTHROPIC_API_KEY and CONVEX_URL" + echo_warn "Please edit .env and set your CONVEX_URL" fi # Load .env @@ -43,10 +43,6 @@ set +a # Validate required environment variables MISSING_VARS=() -if [ -z "$ANTHROPIC_API_KEY" ] || [ "$ANTHROPIC_API_KEY" = "sk-ant-..." ]; then - MISSING_VARS+=("ANTHROPIC_API_KEY") -fi - if [ -z "$CONVEX_URL" ] || [ "$CONVEX_URL" = "https://your-deployment.convex.cloud" ]; then MISSING_VARS+=("CONVEX_URL") fi @@ -65,11 +61,6 @@ if [ ${#MISSING_VARS[@]} -gt 0 ]; then exit 1 fi -# Warn about optional variables -if [ -z "$OPENAI_API_KEY" ]; then - echo_warn "OPENAI_API_KEY not set - image generation will be disabled" -fi - # Build packages needed for Docker images echo_info "Building packages..." pnpm install --frozen-lockfile diff --git a/steps.md b/steps.md new file mode 100644 index 0000000..cd32984 --- /dev/null +++ b/steps.md @@ -0,0 +1,1729 @@ +# Clawe Cloud Deployment Plan + +## Goal + +Deploy Clawe as a multi-tenant SaaS on AWS, where each user signs up, provides their Anthropic API key, and gets their own squadhub service. All infrastructure managed via CDK with full dev/prod environment separation. + +## Architecture Overview + +``` + Users (browser) + │ + ▼ + ┌──────────┐ ┌────────────┐ ┌──────────┐ + │ Route 53 │────▶│ CloudFront │────▶│ ALB │ + └──────────┘ └────────────┘ └────┬─────┘ + │ + ┌─────────────▼──────────────┐ + │ clawe-cluster-{env} │ + │ ┌────────────────────┐ │ + │ │ Web App (Fargate) │ │ + │ │ Shared, all tenants│ │ + │ └───┬────────────────┘ │ + │ │ │ + │ ┌───▼────────────────┐ │ + │ │ Watcher (Fargate) │ │ + │ │ Shared, polls all │ │ + │ └───┬────────────────┘ │ + └──────┼──────────────────────┘ + │ + ┌────────────────┐│┌────────────────┐ + │ │││ │ + ▼ ▼│▼ ▼ + ┌───────────┐ ┌─────▼─────────────────────┐ + │ Convex │ │ clawe-squadhubs-{env} │ + │ (Managed) │ │ ┌────────┐ ┌────────┐ │ + │ Multi- │ │ │SH 1 │ │SH 2 │... │ + │ tenant │ │ │Tenant A│ │Tenant B│ │ + └───────────┘ │ └───┬────┘ └───┬────┘ │ + └──────┼──────────┼─────────┘ + │ │ + ┌───────────────▼──────────▼──┐ + │ EFS │ + │ ┌──────────┐ ┌──────────┐ │ + │ │/tenant-A │ │/tenant-B │ │ + │ └──────────┘ └──────────┘ │ + └─────────────────────────────┘ + + Routing: Web App/Watcher → CloudMap DNS → Squadhub private IP + Auth: Browser: Google → Cognito JWT → user record → accountMembers → account → tenantId + Machine: SQUADHUB_TOKEN → tenant record → tenantId (CLI/watcher) + Images: ECR (shared across envs, tagged per env) +``` + +## Environment Strategy + +All CDK stacks are parameterized by environment (`dev` | `prod`): + +```bash +cdk deploy --context env=dev # Deploys to dev +cdk deploy --context env=prod # Deploys to prod +``` + +Each environment gets completely isolated resources: + +| Resource | Dev | Prod | +| ----------------------- | ----------------------------------------- | ------------------------------------------ | +| Domain | app-dev.clawe.io | app.clawe.io | +| AUTH_PROVIDER | `cognito` | `cognito` | +| Cognito User Pool | clawe-userpool-dev | clawe-userpool-prod | +| Cognito Domain | clawe-dev.auth.{region}.amazoncognito.com | clawe-prod.auth.{region}.amazoncognito.com | +| Sign-in method | Google (social login only) | Google (social login only) | +| ECS Cluster (shared) | clawe-cluster-dev | clawe-cluster-prod | +| ECS Cluster (squadhubs) | clawe-squadhubs-dev | clawe-squadhubs-prod | +| VPC | clawe-vpc-dev | clawe-vpc-prod | +| ALB | clawe-alb-dev | clawe-alb-prod | +| ECR Repos | Shared across envs | Same images, different tags | +| Convex Deployment | Separate dev deployment | Separate prod deployment | +| CloudMap Namespace | clawe-internal-dev | clawe-internal-prod | + +--- + +## Phase 1: Shared Client & Watcher Refactor + +Code-only changes. No infrastructure needed. Makes the codebase multi-tenant ready. + +### 1.1 Refactor shared squadhub client + +Update `packages/shared/src/squadhub/client.ts`: + +- [x] All exported functions take `SquadhubConnection` (`{ squadhubUrl, squadhubToken }`) as first parameter +- [x] Remove the lazy singleton `getSquadhubClient()` — create client per call +- [x] This makes the client stateless and multi-tenant safe +- [x] Apply same pattern to all functions: `saveTelegramBotToken`, `getConfig`, `patchConfig`, `listSessions`, `sessionsSend`, `sendMessage`, `cronList`, `cronAdd` +- [x] Update `pairing.ts` (`approveChannelPairingCode` takes connection param) +- [x] Export `SquadhubConnection` type from `index.ts` +- [x] Update all tests in `client.spec.ts` + +### 1.2 Refactor shared gateway client + +Update `packages/shared/src/squadhub/gateway-client.ts`: + +- [x] `GatewayClient` constructor accepts `url` and `token` as parameters (verified, no env var fallbacks) +- [x] `createGatewayClient()` requires `SquadhubConnection` as first param (no env var fallback) +- [x] `getSharedClient()` in `shared-client.ts` requires `SquadhubConnection` as first param +- [x] Web app callers pass connection explicitly + +### 1.3 Update web app server actions + +Update `apps/web/src/lib/squadhub/actions.ts`: + +- [x] Each action reads `squadhubUrl` and `squadhubToken` from env (for now — will become per-tenant later) +- [x] Passes them explicitly to shared client functions +- [x] This is the adapter layer: currently reads env, later resolves from tenant record + +### 1.4 Update web app API routes + +Update all API routes that use the squadhub client: + +- [x] `/api/squadhub/health` — pass `squadhubUrl`/`squadhubToken` explicitly +- [x] `/api/squadhub/pairing` — same +- [x] `/api/chat` — reads env at call site via `getSquadhubConfig()` +- [x] `/api/chat/history` — passes connection to `getSharedClient()` +- [x] `/api/chat/abort` — passes connection to `getSharedClient()` +- [x] `/api/business/context` — moved env reads to `getEnvConfig()` called at request time + +### 1.5 Refactor watcher — tenant-aware polling loop + +Refactor `apps/watcher/src/index.ts`: + +- [x] Watcher passes `SquadhubConnection` to all shared client calls (`sessionsSend`, `cronList`, `cronAdd`) +- [x] Connection created from existing `config.squadhubUrl`/`config.squadhubToken` +- [x] Multi-tenant polling loop (`getActiveTenants()` → iterate over tenants in delivery and cron setup) +- [x] `deliverToAgent` and `setupCrons` accept `SquadhubConnection` param +- [x] `deliveryLoop` iterates over tenants, passes tenant connection to each squadhub delivery +- [x] Backwards compatibility: `getActiveTenants()` returns single tenant from env vars +- [x] System-level auth with `WATCHER_TOKEN` → implemented in 2.7 (`validateWatcherToken` + `tenants.listActive`) +- [x] Tenant-scoped Convex calls with per-tenant `machineToken` → implemented in 2.6 (added `machineToken` to all queries) + +Current loop: + +``` +every 2s: poll notifications → deliver to SQUADHUB_URL +every 10s: check routines → trigger via SQUADHUB_URL +``` + +New loop: + +``` +every 2s: + 1. Query Convex for all active tenants (tenants.listActive, authenticated with WATCHER_TOKEN) + 2. For each tenant: + a. Query undelivered notifications using tenant's squadhubToken as machineToken + b. Deliver to tenant's squadhubUrl +every 10s: + 1. Query Convex for all active tenants (same WATCHER_TOKEN) + 2. For each tenant: + a. Check due routines using tenant's squadhubToken as machineToken + b. Trigger via tenant's squadhubUrl +``` + +Authentication: + +- System-level calls (`tenants.listActive`) use `WATCHER_TOKEN` env var +- Tenant-scoped calls (`notifications.getUndelivered`, `routines`) use each tenant's `squadhubToken` as `machineToken` (returned by `listActive`) + +For backwards compatibility during development: if no tenants table exists yet, fall back to env-based single-tenant mode. + +### 1.6 Move startup logic from watcher to provisioning route + +- [x] Remove `registerAgents()` from watcher → moved to `apps/web/src/lib/squadhub/setup.ts` +- [x] Remove `setupCrons()` from watcher → moved to `apps/web/src/lib/squadhub/setup.ts` +- [x] Remove `seedRoutines()` from watcher → moved to `apps/web/src/lib/squadhub/setup.ts` +- [x] Create `POST /api/tenant/provision` route that runs all setup (agents, crons, routines) +- [x] Watcher is now purely a polling/delivery service (no startup side effects) + +### 1.7 Update CLI for multi-tenancy + +The CLI (`packages/cli`) communicates with Convex directly via `ConvexHttpClient`. It needs to authenticate as a machine caller using the per-tenant `SQUADHUB_TOKEN`. + +- [x] CLI reads `SQUADHUB_TOKEN` from env and exports as `machineToken` +- [x] Update `packages/cli/src/client.ts` — added `query`/`mutation`/`action` wrapper functions (pass-through for now; Phase 2.6 will inject `machineToken`) +- [x] Update all 17 command handlers in `packages/cli/src/commands/` to use wrappers instead of `client.query()`/`client.mutation()` +- [x] Update all 16 test files to match new mock pattern + +### 1.8 Rebuild and test + +- [x] Rebuild shared package: `pnpm --filter @clawe/shared build` +- [x] Rebuild CLI: `pnpm --filter @clawe/cli build` +- [x] Run all tests: `pnpm test` — 155 tests passing (shared 21, cli 64, watcher 5, web 65) +- [x] Verify local docker compose still works (single-tenant mode with env vars) + +--- + +## Phase 2: Multi-Tenancy in Convex + +### 2.1 Add users table + +- [x] Add to `packages/backend/convex/schema.ts`: + +```typescript +users: defineTable({ + email: v.string(), // From Cognito JWT (immutable, unique) + name: v.optional(v.string()), // Display name + createdAt: v.number(), + updatedAt: v.number(), +}) + .index("by_email", ["email"]), +``` + +Cognito is only the authentication layer. The `users` table in Convex is the source of truth for user data. On first login, a user record is created from the Cognito JWT email. + +### 2.2 Add accounts + tenants tables + +- [x] Add `accounts` table to `packages/backend/convex/schema.ts`: + +```typescript +accounts: defineTable({ + name: v.optional(v.string()), // "Guy's Account" + createdAt: v.number(), + updatedAt: v.number(), +}), +``` + +- [x] Add `tenants` table with `accountId` (tenant belongs to an account, not a user): + +```typescript +tenants: defineTable({ + accountId: v.id("accounts"), // Account that owns this tenant + status: v.union( + v.literal("provisioning"), // Squadhub being created + v.literal("active"), // Squadhub running + v.literal("stopped"), // Squadhub stopped (future: sleep mode) + v.literal("error"), // Provisioning failed + ), + squadhubUrl: v.optional(v.string()), // Internal squadhub URL (CloudMap DNS) + squadhubToken: v.optional(v.string()), // Per-tenant squadhub token + squadhubServiceArn: v.optional(v.string()), // ECS Service ARN + efsAccessPointId: v.optional(v.string()), // EFS access point + anthropicApiKey: v.optional(v.string()), // User's Anthropic API key + settings: v.optional(v.object({ + timezone: v.optional(v.string()), + })), + createdAt: v.number(), + updatedAt: v.number(), +}) + .index("by_account", ["accountId"]) + .index("by_status", ["status"]) + .index("by_squadhubToken", ["squadhubToken"]), +``` + +### 2.3 Add accountMembers join table + +- [x] Add to `packages/backend/convex/schema.ts`: + +```typescript +accountMembers: defineTable({ + userId: v.id("users"), + accountId: v.id("accounts"), + role: v.union(v.literal("owner"), v.literal("member")), + createdAt: v.number(), +}) + .index("by_user", ["userId"]) + .index("by_account", ["accountId"]) + .index("by_user_account", ["userId", "accountId"]), +``` + +Access model: `user → accountMembers → account → tenants` + +- A user belongs to an account via `accountMembers` +- An account can own multiple tenants (future: plan-based limits) +- A tenant belongs to exactly one account +- If you're a member of an account, you can access all its tenants + +### 2.4 Add tenantId to all existing tables + +Add `tenantId: v.id("tenants")` field to every table: + +- [x] `agents` — add field + index `by_tenant: ["tenantId"]` +- [x] `tasks` — add field + index `by_tenant: ["tenantId"]` +- [x] `messages` — add field + index `by_tenant: ["tenantId"]` +- [x] `notifications` — add field + index `by_tenant: ["tenantId"]` +- [x] `activities` — add field + index `by_tenant: ["tenantId"]` +- [x] `documents` — add field + index `by_tenant: ["tenantId"]` +- [x] `businessContext` — add field + index `by_tenant: ["tenantId"]` +- [x] `channels` — add field + compound index `by_tenant_type: ["tenantId", "type"]` +- [x] `routines` — add field + index `by_tenant: ["tenantId"]` + +### 2.5 Create user and tenant resolution helpers + +Create `packages/backend/convex/lib/auth.ts`: + +Two auth paths — browser (JWT) and machine (token): + +**Browser path (web app):** + +- [x] Helper function `getUser(ctx)` — gets auth identity, extracts email, looks up `users` table by email +- [x] Helper function `getTenantIdFromJwt(ctx)` — calls `getUser(ctx)`, queries `accountMembers` by user → account → `tenants` by account, returns first tenant's `_id` + +**Machine path — per-tenant (CLI):** + +- [x] Helper function `getTenantIdFromToken(ctx, machineToken)` — queries `tenants` by `by_squadhubToken` index, returns tenant `_id` + +**Machine path — system-level (watcher):** + +- [x] Helper function `validateWatcherToken(ctx, watcherToken)` — reads `WATCHER_TOKEN` from Convex env vars, compares with provided token +- [x] Wire into watcher's `getActiveTenants()` (done in 2.7) + +**Unified resolver (for tenant-scoped functions):** + +- [x] Helper function `resolveTenantId(ctx, args)` + `resolveTenantIdMut(ctx, args)`: + 1. If `args.machineToken` is provided → `getTenantIdFromToken(ctx, args.machineToken)` + 2. If `AUTH_ENABLED=true` → `getTenantIdFromJwt(ctx)` (requires valid JWT) + 3. If `AUTH_ENABLED=false` → `getOrCreateDevTenant(ctx)` (auto-creates a default dev tenant, mutation variant only) + +Create `packages/backend/convex/users.ts`: + +- [x] `getOrCreateFromAuth(ctx)` — called on every login, creates user record if needed +- [x] `get()` — get current authenticated user +- [x] `update(name)` — update profile + +### 2.6 Update all Convex queries and mutations + +Every query/mutation needs to accept optional `machineToken` arg, call `resolveTenantId(ctx, args)`, filter reads by `tenantId`, include `tenantId` in writes. + +Example — `agents.list`: + +```typescript +// Before: +export const list = query({ + args: {}, + handler: async (ctx) => { + return await ctx.db.query("agents").collect(); + }, +}); + +// After: +export const list = query({ + args: { machineToken: v.optional(v.string()) }, + handler: async (ctx, args) => { + const tenantId = await resolveTenantId(ctx, args); + return await ctx.db + .query("agents") + .withIndex("by_tenant", (q) => q.eq("tenantId", tenantId)) + .collect(); + }, +}); +``` + +- Browser calls (web app): `useQuery(api.agents.list)` — no `machineToken`, uses JWT +- Machine calls (CLI): `client.query(api.agents.list, { machineToken })` — uses token + +Files to update: + +- [x] `convex/agents.ts` — all functions +- [x] `convex/tasks.ts` — all functions +- [x] `convex/messages.ts` — all functions +- [x] `convex/notifications.ts` — all functions +- [x] `convex/activities.ts` — all functions +- [x] `convex/documents.ts` — all functions +- [x] Remove `convex/settings.ts` — migrated `timezone` to `tenants.settings` and `onboardingComplete` to `accounts.onboardingComplete`, dropped `settings` table from schema. Created `convex/tenants.ts` with `getTimezone`, `setTimezone`. Created `convex/accounts.ts` with `isOnboardingComplete`, `completeOnboarding`. Updated callers: + - `apps/web/src/app/(dashboard)/settings/general/_components/timezone-settings.tsx` → `api.tenants.getTimezone` / `api.tenants.setTimezone` + - `apps/web/src/hooks/use-onboarding-guard.ts` → `api.accounts.isOnboardingComplete` + - `apps/web/src/app/page.tsx` → `api.accounts.isOnboardingComplete` + - `apps/web/src/app/setup/complete/page.tsx` → `api.accounts.completeOnboarding` + - `apps/web/src/app/setup/provisioning/page.tsx` → `api.accounts.isOnboardingComplete` + - `apps/watcher/src/index.ts` → `api.tenants.getTimezone` (with `machineToken`) +- [x] `convex/businessContext.ts` — all functions +- [x] `convex/channels.ts` — all functions +- [x] `convex/routines.ts` — all functions +- [x] Update watcher to pass `machineToken` in all Convex calls (`agents.list`, `notifications.getUndelivered`, `notifications.markDelivered`, `routines.getDueRoutines`, `routines.trigger`, `tenants.getTimezone`) + +### 2.7 Create tenant management functions + +Create `packages/backend/convex/tenants.ts`: + +- [x] `create(accountId)` — creates tenant with `accountId`, status "provisioning" +- [x] `getForCurrentUser()` — gets tenant for authenticated user: + 1. Call `getUser(ctx)` to get user record + 2. Query `accountMembers.by_user` to find account membership + 3. Query `tenants.by_account` to find tenant in that account + 4. Return tenant +- [x] `updateStatus(tenantId, status, squadhubUrl?, serviceArn?)` — update provisioning state +- [x] `setApiKeys(anthropicApiKey, openaiApiKey?)` — store API keys in tenant record +- [x] `getApiKeys()` — returns masked API keys (last 4 chars) for settings display +- [x] `listActive(watcherToken)` — get all active tenants with squadhubUrl + squadhubToken (requires valid `WATCHER_TOKEN`) +- [x] Wire watcher's `getActiveTenants()` to call `tenants.listActive` instead of env fallback (deferred from 1.5 → 2.5 → here) +- [x] Wire `validateWatcherToken` into `listActive` (from 2.5) + +--- + +## Phase 3: Authentication (Always-On Auth) + +Auth is **always on**. Two providers, one interface: + +- **Cloud**: Cognito (Google social login) — JWKS validated by Convex +- **Local / self-hosted**: NextAuth with Credentials provider — auto-login, zero config + +The `AUTH_PROVIDER` env var (`nextauth` | `cognito`) controls which provider is active. Both paths produce valid JWTs with the same claims shape. All `AUTH_ENABLED` conditionals have been eliminated. + +### 3.1 Convex Auth integration + +- [x] `packages/backend/convex/auth.config.nextauth.ts` — NextAuth `customJwt` provider (JWKS URL) +- [x] `packages/backend/convex/auth.config.cognito.ts` — Cognito provider (kept from before) +- [x] `packages/backend/convex/auth.config.ts` — Committed as NextAuth version (default for local dev). Deploy script overwrites for cloud. +- [x] `packages/backend/convex/dev-jwks/` — Committed dev RSA key pair + JWKS JSON for local JWT signing +- [x] `packages/backend/convex/lib/auth.ts` — Simplified: removed `AUTH_ENABLED` checks, dev-tenant logic, `resolveTenantIdMut`. Single `resolveTenantId` works for both queries and mutations. +- [x] `scripts/convex-deploy.sh` — Updated: uses `AUTH_PROVIDER` instead of `AUTH_ENABLED` + +### 3.2 Web app — Auth provider and login page + +Dual-provider architecture selected by `NEXT_PUBLIC_AUTH_PROVIDER`: + +- [x] `apps/web/src/lib/auth/nextauth-config.ts` — NextAuth v5 config with Credentials + RS256 JWT signing +- [x] `apps/web/src/app/api/auth/[...nextauth]/route.ts` — NextAuth API route handler +- [x] `apps/web/src/app/api/auth/jwks/route.ts` — JWKS endpoint for Convex to validate NextAuth JWTs +- [x] `apps/web/src/providers/auth-provider.tsx` — Dual provider: NextAuth (SessionProvider) or Cognito (Amplify, dynamic import). Shared `AuthContextValue` interface. `useConvexAuth` hook for ConvexProviderWithAuth. +- [x] `apps/web/src/providers/convex-provider.tsx` — Always uses `ConvexProviderWithAuth` (no conditional) +- [x] `apps/web/src/app/auth/login/page.tsx` — Email form (NextAuth, auto-login for local dev) or Google button (Cognito) +- [x] `apps/web/src/hooks/use-user-menu.ts` — Simplified: always reads from `useAuth()`, no `AUTH_ENABLED` branches +- [x] `apps/web/src/hooks/use-onboarding-guard.ts` — Simplified: always checks auth, no `AUTH_ENABLED` branches +- [x] `.env.example` + `apps/web/.env.*` — Updated for `AUTH_PROVIDER`, `NEXTAUTH_SECRET`, `NEXTAUTH_URL` + +### 3.3 Web app — Auth middleware + +- [x] Protect all routes except `/auth/login` and `/api/health` — `apps/web/src/middleware.ts` checks session cookie, allows `PUBLIC_PATHS` +- [x] Redirect unauthenticated users to `/auth/login` — middleware redirects when no `authjs.session-token` cookie +- [x] Pass tenant context to downstream routes — tenant resolved in Convex via JWT email → user → accountMembers → tenant + +### 3.4 Web app — Auth UI components + +- [x] User menu in dashboard sidebar (email, logout) — wired `signOut` from `useAuth` +- [x] Onboarding guard hook checks auth state — redirects to `/auth/login` when not authenticated + +--- + +## Phase 4: Dev & Self-Hosted Provisioning + +**Goal**: After this phase, local dev and self-hosted deployments work end-to-end. A user can sign in, get auto-provisioned, go through onboarding, use the dashboard, and the watcher operates correctly — all without any cloud infrastructure. + +Everything remaining after this phase is cloud-only (CDK, AWS, Cognito, multi-tenant squadhub infrastructure). This clean boundary allows a separate `clawe-cloud` repo to extend this codebase with cloud-specific plugins without modifying the base code. + +### 4.1 Plugin architecture (`packages/plugins/`) + +Create a `@clawe/plugins` package that provides interfaces for cloud-extensible behavior and a plugin registry. The base repo contains interfaces + dev defaults. The `clawe-cloud` repo adds `packages/cloud-plugins/` that implements these interfaces with AWS. + +**Package structure:** + +``` +packages/plugins/ +├── src/ +│ ├── index.ts # Main exports: loadPlugins, hasPlugin, getPlugin, interfaces +│ ├── registry.ts # Plugin registry internals +│ ├── interfaces/ +│ │ ├── index.ts # Re-exports all interfaces +│ │ ├── squadhub-provisioner.ts # SquadhubProvisioner interface +│ │ └── squadhub-lifecycle.ts # SquadhubLifecycle interface +│ └── defaults/ +│ ├── index.ts # Default implementations +│ ├── squadhub-provisioner.ts # DefaultSquadhubProvisioner (reads env vars) +│ └── squadhub-lifecycle.ts # DefaultSquadhubLifecycle (no-ops) +├── package.json +└── tsconfig.json +``` + +**Plugin interfaces:** + +```typescript +// interfaces/squadhub-provisioner.ts +export interface SquadhubProvisioner { + /** + * Create infrastructure for a new tenant. + * Dev: reads SQUADHUB_URL + SQUADHUB_TOKEN from env, returns immediately. + * Cloud: creates ECS service + EFS access point + CloudMap entry, waits for health. + */ + provision(params: { + tenantId: string; + accountId: string; + anthropicApiKey?: string; + convexUrl: string; + }): Promise<{ + squadhubUrl: string; + squadhubToken: string; + metadata?: Record; // Cloud: ECS ARN, EFS AP ID, etc. + }>; + + /** + * Check provisioning progress (for polling UI). + * Dev: always returns "active" (instant). + * Cloud: returns "provisioning" while ECS service is starting. + */ + getProvisioningStatus(tenantId: string): Promise<{ + status: "provisioning" | "active" | "error"; + message?: string; + }>; + + /** + * Tear down all infrastructure for a tenant. + * Dev: no-op. + * Cloud: deletes ECS service + EFS access point + CloudMap entry. + */ + deprovision(params: DeprovisionParams): Promise; +} + +// interfaces/squadhub-lifecycle.ts +export interface SquadhubLifecycle { + /** + * Restart the tenant's squadhub service (e.g., after API key change). + * Dev: no-op (user manually restarts docker). + * Cloud: forces new ECS task deployment. + */ + restart(tenantId: string): Promise; + + /** + * Stop the tenant's squadhub service. + * Dev: no-op. + * Cloud: sets ECS desiredCount to 0. + */ + stop(tenantId: string): Promise; + + /** + * Destroy tenant's squadhub resources permanently. + * Dev: no-op. + * Cloud: deletes ECS service + EFS access point + CloudMap entry. + */ + destroy(tenantId: string): Promise; + + /** + * Check health/status of the tenant's squadhub. + * Dev: returns { running: true, healthy: true }. + * Cloud: checks ECS service running count + task health. + */ + getStatus(tenantId: string): Promise<{ + running: boolean; + healthy: boolean; + }>; +} +``` + +**Registry:** + +```typescript +// registry.ts +export interface PluginMap { + "squadhub-provisioner": SquadhubProvisioner; + "squadhub-lifecycle": SquadhubLifecycle; +} + +let plugins: PluginMap; // Always set (dev defaults or cloud) +let cloudLoaded = false; + +/** + * Initialize plugins. Call once at app startup. + * Tries to load @clawe/cloud-plugins. Falls back to dev defaults. + */ +export async function loadPlugins(): Promise { + try { + const cloud = await import("@clawe/cloud-plugins"); + plugins = cloud.register(); + cloudLoaded = true; + } catch { + plugins = { + "squadhub-provisioner": new DefaultSquadhubProvisioner(), + "squadhub-lifecycle": new DefaultSquadhubLifecycle(), + }; + } +} + +/** Returns true if cloud plugins are loaded (vs dev defaults). */ +export function hasPlugin(name: keyof PluginMap): boolean { + return cloudLoaded; +} + +/** Get a plugin implementation. Always returns something (cloud or dev default). */ +export function getPlugin(name: K): PluginMap[K] { + return plugins[name]; +} +``` + +**Dev default implementations:** + +```typescript +// defaults/squadhub-provisioner.ts — DefaultSquadhubProvisioner +export class DefaultSquadhubProvisioner implements SquadhubProvisioner { + async provision(params) { + // Read from env vars — single-tenant dev mode + return { + squadhubUrl: process.env.SQUADHUB_URL || "http://localhost:18790", + squadhubToken: process.env.SQUADHUB_TOKEN || "", + }; + } + + async getProvisioningStatus() { + return { status: "active" as const }; // Instant in dev + } + + async deprovision() {} // No-op in dev +} + +// defaults/squadhub-lifecycle.ts — DefaultSquadhubLifecycle +export class DefaultSquadhubLifecycle implements SquadhubLifecycle { + async restart() {} // No-op — user manages docker + async stop() {} // No-op + async destroy() {} // No-op + async getStatus() { + return { running: true, healthy: true }; + } +} +``` + +**Consumption pattern (call sites):** + +```typescript +import { getPlugin, hasPlugin } from "@clawe/plugins"; + +// Provision route — always works (dev or cloud) +const provisioner = getPlugin("squadhub-provisioner"); +const result = await provisioner.provision({ tenantId, accountId, convexUrl }); + +// Settings UI — conditionally show restart button +if (hasPlugin("squadhub-lifecycle")) { + // Show "Restart Service" button after API key change +} +``` + +Setup: + +- [x] Create `packages/plugins/` package with interfaces, registry, and dev defaults +- [x] Add to `pnpm-workspace.yaml` (already covered by `packages/*` glob) +- [x] Build outputs: ESM (consumable by Next.js, Node, and React) +- [x] Export types and runtime functions from package entry point + +### 4.2 Dev provision route + +Update `apps/web/src/app/api/tenant/provision/route.ts`: + +Current state: Only runs app-level setup (agents, crons, routines). Doesn't create the tenant or wire connection tokens. + +Changes: + +- [x] Call `loadPlugins()` at route level (ensures registry is initialized) +- [x] Authenticate the caller via NextAuth session (import `auth` from nextauth-config) +- [x] Get user email from session → look up user in Convex +- [x] Call Convex `accounts.getOrCreateForUser()` to ensure account + membership exist +- [x] Check if account already has a tenant (`tenants.getForCurrentUser`) +- [x] If no tenant: call `tenants.create(accountId)`, then use `getPlugin("squadhub-provisioner").provision()` to get `squadhubUrl` + `squadhubToken`, update tenant with these values + `status: "active"` +- [x] Run existing app-level setup (register agents, setup crons, seed routines) — already implemented in `setupTenant()` +- [x] Return `{ ok: true, tenantId }` + +The route is idempotent — calling it when a tenant already exists is a no-op (returns existing tenant). + +**Cloud extensibility**: The provision route uses `getPlugin("squadhub-provisioner")` which returns the dev provisioner by default (reads env vars, instant). When `@clawe/cloud-plugins` is installed (in `clawe-cloud` repo), the same route automatically uses the cloud provisioner (creates ECS/EFS/CloudMap, waits for health). No file changes needed. + +### 4.3 WATCHER_TOKEN setup + +The watcher uses two tokens for Convex authentication: + +1. **`WATCHER_TOKEN`** — system-level token for `tenants.listActive()` (get all active tenants) +2. **Per-tenant `squadhubToken`** — returned by `listActive()`, used as `machineToken` for tenant-scoped queries + +Setup: + +- [x] Add `WATCHER_TOKEN` to root `.env.example` with a default dev value (e.g., `clawe-watcher-dev-token`) +- [x] Add `WATCHER_TOKEN` to root `.env` with the same default +- [x] Add `WATCHER_TOKEN` to `turbo.json` `globalPassThroughEnv` +- [x] Set `WATCHER_TOKEN` as a Convex env var — auto-synced on `pnpm dev` via `scripts/sync-convex-env.sh` (runs as background process in backend dev script, waits for local Convex backend, then sets all env vars) +- [x] Document in `.env.example`: "Must match the value set in Convex env vars" + +After the provision route (4.2) runs, the tenant record has `squadhubToken` set to `SQUADHUB_TOKEN` from env. The watcher calls `tenants.listActive(watcherToken)` → gets the tenant's `squadhubToken` → passes it as `machineToken` to all tenant-scoped Convex calls. The full chain works. + +### 4.4 Auto-provision flow in web app + +After login, the app needs to automatically provision the user if they don't have an active tenant. + +- [x] Login page (`/auth/login`) only does: authenticate → `getOrCreateFromAuth()` → redirect to `/` +- [x] Root page (`/`) checks tenant status: no active tenant → redirect to `/setup/provisioning` +- [x] Provisioning page (`/setup/provisioning`) calls `POST /api/tenant/provision`, shows loading/error UI +- [x] Onboarding guards check tenant status before checking `isOnboardingComplete` +- [x] For returning users (tenant already exists), the idempotent provision route returns immediately. + +Flow after this step: + +``` +Login → auth → getOrCreateFromAuth → redirect to / + → root page queries tenant: + → no active tenant: redirect to /setup/provisioning + → provision → redirect to /setup/welcome (or /board if onboarding complete) + → has active tenant, not onboarded: redirect to /setup + → has active tenant, onboarded: redirect to /board +``` + +### 4.5 Environment variables summary + +Root `.env.example` additions: + +```bash +# Watcher system token (must also be set as Convex env var) +WATCHER_TOKEN=clawe-watcher-dev-token +``` + +`turbo.json` `globalPassThroughEnv` additions: + +``` +WATCHER_TOKEN +``` + +Convex env vars (set via CLI): + +```bash +npx convex env set WATCHER_TOKEN clawe-watcher-dev-token +``` + +### 4.6 Verification + +After this phase, verify the complete flow: + +1. **Fresh start**: Clear Convex data, restart services +2. **Login**: `AUTO_LOGIN_EMAIL` auto-signs in (or Google OAuth if configured) +3. **Auto-provision**: User + account + tenant created, tenant has `squadhubUrl`/`squadhubToken` from env, agents registered, crons set up +4. **Onboarding**: `/setup/welcome` → business context → telegram → complete — all tenant-scoped queries resolve +5. **Dashboard**: All pages work (agents, board, settings) — data is tenant-scoped +6. **Watcher**: `WATCHER_TOKEN` validates → `listActive` returns tenant → per-tenant `machineToken` resolves → notifications delivered + +### 4.7 Dedicated provisioning screen + +Move the provision step out of the login page into a shared onboarding screen. This screen is used by all modes (dev, self-hosted, cloud). + +**Why**: Login should only authenticate. Provisioning (creating tenant, registering agents, setting up crons) is a separate concern that deserves its own screen with loading state and error handling. For cloud, this step will take longer (spinning up ECS), so having a dedicated screen is essential. For dev/self-hosted it's instant but the flow should be the same. + +**Changes:** + +- [x] Remove `apiClient.post("/api/tenant/provision")` from `apps/web/src/app/auth/login/page.tsx` +- [x] Login page only does: authenticate → `getOrCreateFromAuth()` → redirect to `/` +- [x] Root page (`/`) queries `api.tenants.getForCurrentUser`, redirects to `/setup/provisioning` if no active tenant +- [x] Create `apps/web/src/app/setup/provisioning/page.tsx`: + - Calls `POST /api/tenant/provision` on mount + - Shows loading screen: spinner + "Setting up your workspace..." + - On success: Convex subscription updates reactively → redirect fires + - On error: show error message with retry button + - If tenant already active (idempotent): redirect immediately + - Uses `useRef` to prevent double-fire in StrictMode +- [x] Update onboarding guard (`use-onboarding-guard.ts`): check tenant exists and is active before checking `isOnboardingComplete`. If no active tenant → redirect to `/setup/provisioning`. Uses `usePathname()` to avoid redirect loop. +- [x] Rename `lib/squadhub/provision.ts` → `lib/squadhub/setup.ts`, `provisionTenant()` → `setupTenant()` (app-level setup, not infrastructure provisioning) +- [x] For cloud (future): poll `GET /api/tenant/status` while status is `"provisioning"`, show progress messages ("Creating workspace...", "Starting services...", "Almost ready...") + +**Updated flow:** + +``` +Login → auth → getOrCreateFromAuth → redirect to / + +/ (root page): + → query getForCurrentUser + → null or not active → /setup/provisioning + → active, not onboarded → /setup + → active, onboarded → /board + +/setup/provisioning: + → tenant already active? → redirect out + → call POST /api/tenant/provision + → spinner while provisioning + → Convex updates reactively → redirect to /setup/welcome or /board + +/setup/* (welcome, business, telegram, complete): + → guard: no active tenant → /setup/provisioning + → guard: already onboarded → /board + +/board (dashboard): + → guard: no active tenant → /setup/provisioning + → guard: not onboarded → /setup +``` + +--- + +## Phase 5: CDK Project Setup & Base Infrastructure + +### 5.1 Initialize CDK project + +Create the CDK project in `packages/infra/` (in `clawe-cloud` repo): + +``` +packages/infra/ +├── bin/ +│ └── clawe.ts # CDK app entry point +├── lib/ +│ ├── config.ts # Environment config (dev/prod values) +│ ├── networking-stack.ts # VPC, subnets, security groups +│ ├── auth-stack.ts # Cognito +│ ├── storage-stack.ts # EFS, ECR +│ ├── shared-services-stack.ts # ECS cluster, ALB, web app, watcher +│ ├── tenant-stack.ts # Per-tenant squadhub construct (Phase 7) +│ ├── dns-stack.ts # Route53, CloudFront, ACM +│ └── monitoring-stack.ts # CloudWatch dashboards, alarms +├── cdk.json +├── tsconfig.json +└── package.json +``` + +- [x] Add `infra` to pnpm workspace — lives in `packages/infra/`, covered by `packages/*` glob +- [x] Install CDK dependencies: `aws-cdk-lib`, `constructs`, `aws-cdk`, `@types/node`, `ts-node` +- [x] Configure `cdk.json` with context defaults (`env: "dev"`) +- [x] Create `bin/clawe.ts` — CDK app entry point wiring all stacks with `--context env=dev|prod` +- [x] Create `lib/config.ts` — environment config (domain, sizing, namespace per dev/prod) +- [x] Create `lib/networking-stack.ts` — VPC, 5 security groups (ALB, web, watcher, squadhub, EFS) +- [x] Create `lib/auth-stack.ts` — Cognito User Pool with Google social login +- [x] Create `lib/storage-stack.ts` — 3 ECR repos + encrypted EFS +- [x] Create `lib/shared-services-stack.ts` — 2 ECS clusters, ALB, web app + watcher Fargate services +- [x] Create `lib/dns-stack.ts` — Route53, ACM certificate, CloudFront distribution +- [x] Create `lib/monitoring-stack.ts` — CloudWatch dashboard + 5xx alarm +- [x] Type-check passes cleanly (`tsc --noEmit`) + +### 5.2 Cognito User Pool (Google social login only) + +All implemented in `auth-stack.ts` during 5.1. Custom domain (`auth.clawe.io`) instead of prefix domain. Added deletion protection, case-insensitive sign-in, prevent user existence errors (adopted from ChartDB). Removed dead `googleClientSecret` variable; added pre-deploy SSM setup instructions as comments. + +Create Cognito User Pool: + +- [x] Cognito is only the auth layer — user data lives in Convex `users` table +- [x] No self sign-up with email/password — Google is the only sign-in method (`selfSignUpEnabled: false`) +- [x] No custom Cognito attributes needed (only standard `email` + `fullname`) + +Configure Google as federated identity provider: + +- [x] Create OAuth 2.0 Client ID in Google Cloud Console (Web application type) + - Authorized redirect URI: `https://{authDomain}/oauth2/idpresponse` +- [x] Add Google as identity provider in Cognito User Pool: + - Client ID + Client Secret from SSM (`/clawe/{env}/google-client-id`, `/clawe/{env}/google-client-secret`) + - Scopes: `openid email profile` + - Attribute mapping: Google `email` → Cognito `email`, Google `name` → Cognito `fullname` + +Configure Cognito domain: + +- [x] Custom domain: `auth.clawe.io` (prod) / `auth-dev.clawe.io` (dev) with ACM cert + Route53 alias + +Create App Client: + +- [x] No client secret (public client for SPA) — `generateSecret: false` +- [x] Auth flows: `userSrp: true` (SRP kept as fallback, Google login uses OAuth flow) +- [x] OAuth settings: + - Callback URL: `https://app.clawe.io` (prod) / `https://app-dev.clawe.io` (dev) / `http://localhost:3000` (local) + - Sign-out URL: same as callback + - OAuth scopes: `openid`, `email`, `profile` + - Supported identity providers: Google +- [x] Token validity: access 1h, id 1h, refresh 30d + +- [x] Output User Pool ID, App Client ID, and Cognito Domain as stack outputs +- [x] Store Google OAuth Client ID and Secret in SSM Parameter Store — done for both dev and prod + +### 5.3 Environment config + +- [x] Created `packages/infra/lib/config.ts` during 5.1 — maps `env` context to all environment-specific values including `authDomain`, `cloudMapNamespace`, sizing for web/watcher/squadhub, `natGateways` + +### 5.4 Networking stack + +Created `packages/infra/lib/networking-stack.ts` during 5.1: + +- [x] 2 public subnets (ALB) +- [x] 2 private subnets with NAT (ECS tasks, EFS) +- [x] 1 NAT Gateway (dev) / 2 NAT Gateways (prod) for cost vs HA tradeoff +- [x] Security groups: + - `alb-sg`: inbound 443/80 from internet + - `web-sg`: inbound from ALB only (port 3000) + - `squadhub-sg`: inbound from web-sg and watcher-sg (port 18789) + - `watcher-sg`: outbound to squadhub-sg and internet (Convex) + - `efs-sg`: inbound NFS (2049) from squadhub-sg + +### 5.5 ECR repositories + +Created in `packages/infra/lib/storage-stack.ts` during 5.1: + +- [x] `clawe/web` — Next.js web app image +- [x] `clawe/squadhub` — Squadhub (OpenClaw gateway) image +- [x] `clawe/watcher` — Watcher service image +- [x] Lifecycle policy: keep last 10 images + +### 5.6 EFS filesystem + +Created in `packages/infra/lib/storage-stack.ts` during 5.1: + +- [x] Encrypted at rest +- [x] Performance mode: generalPurpose +- [x] Throughput mode: bursting (sufficient for tens of users) +- [x] Per-tenant access points created dynamically during provisioning (Phase 8.2 — CloudSquadhubProvisioner) +- [x] Mount targets in each private subnet (created in storage-stack.ts) + +### 5.7 SSM Parameter Store secrets + +Generate and store shared secrets in SSM Parameter Store (SecureString): + +- [x] `/clawe/{env}/watcher-token` (SecureString) — created for dev and prod +- [x] `/clawe/{env}/google-client-id` (String) — created for dev and prod +- [x] `/clawe/{env}/google-client-secret` (SecureString) — created for dev and prod +- [x] Set watcher token as Convex env var — automated in deploy workflow ("Sync Convex env vars" step reads from SSM, sets via `npx convex env set`) + +### 5.8 Shared wildcard certificate + +Created `lib/certificate-stack.ts`. Single ACM wildcard certificate for all subdomains, replacing per-service certs. + +- [x] Create `lib/certificate-stack.ts` — standalone stack (deployed in `us-east-1`) +- [x] Certificate covers: `*.clawe.io` + `clawe.io` (SAN) +- [x] DNS validation via Route53 hosted zone lookup +- [x] Export certificate ARN as stack output (`clawe-wildcard-certificate-arn`) +- [x] Deploy order: certificate-stack deploys **before** auth-stack and dns-stack +- [x] Update `auth-stack.ts` — removed per-domain `AuthCertificate`, accepts shared wildcard cert as prop +- [x] Update `dns-stack.ts` — removed per-domain certificate, accepts shared wildcard cert as prop +- [x] Update `shared-services-stack.ts` — ALB HTTPS listener now uses shared wildcard cert +- [x] Update `bin/clawe.ts` — added certificate stack, passes cert to auth-stack, dns-stack, and shared-services-stack + +CloudFront → ALB now uses HTTPS (not HTTP), since ALB has the wildcard cert. + +### 5.9 GitHub Actions OIDC for AWS + +Created `lib/ci-stack.ts`. Keyless AWS auth — no long-lived credentials in GitHub. + +- [x] Create `lib/ci-stack.ts` — deployed once per account (not per environment) +- [x] Create IAM OIDC provider for GitHub (`token.actions.githubusercontent.com`) +- [x] Create IAM role `clawe-github-ci` with trust policy: + - Principal: GitHub OIDC provider + - Condition: `sub` matches `repo:getclawe/clawe-cloud:*` (only this repo can assume) +- [x] Role permissions (CDK-native approach — assumes CDK bootstrap roles for deploy): + - `sts:AssumeRole` on `cdk-*` bootstrap roles (CDK deploy delegates to these) + - ECR: full push permissions (for Docker image builds done outside CDK) + - CloudFront: `CreateInvalidation` (post-deploy cache clear) + - SSM: `GetParameter` scoped to `/clawe/*` (read env config) + - STS: `GetCallerIdentity` +- [x] Export role ARN as stack output (`clawe-github-ci-role-arn`) +- [x] Add to `bin/clawe.ts` — deployed as `ClaweCi` (no env parameterization) + +### 5.10 CI/CD deploy workflow + +Created `.github/workflows/deploy.yml`. Manual (`workflow_dispatch`) — no auto-deploy on push. + +**Deployment approach**: All ECS service updates go through CDK (`cdk deploy`), not direct `aws ecs update-service`. CDK is the single source of truth — when the image tag changes, CloudFormation detects the task definition update and triggers ECS rolling deployment automatically. This prevents drift and gives us circuit breaker + auto-rollback for free. (Pattern adopted from ChartDB's backend-node deployment.) + +- [x] Trigger: `workflow_dispatch` with inputs: + - `environment`: choice of `dev` / `prod` + - `targets`: choice of `all` / `infra` / `web` / `watcher` / `squadhub` / `convex` +- [x] Authentication: `aws-actions/configure-aws-credentials` with OIDC (role from 5.9) +- [x] Steps (conditional on `targets`): + 1. **Checkout** + pnpm + Node.js setup + 2. **Configure AWS credentials** via OIDC + 3. **Build & push Docker images** to ECR (web, watcher, squadhub — each conditional) + 4. **Tag images**: `{repo}:{env}-{git-sha}` + `{repo}:{env}-latest` + 5. **Deploy Convex** via `./scripts/convex-deploy.sh` + 6. **CDK deploy** — single command handles infra + ECS updates: + ```bash + npx cdk deploy --all --context env={env} --context imageTag={env}-{sha} + ``` + 7. **Invalidate CloudFront cache** — reads distribution ID from stack outputs + 8. **Smoke test** — retries health check for up to 3 minutes +- [x] ECS deployment settings (defined in CDK shared-services-stack): + - Circuit breaker enabled with auto-rollback on health check failure + - MaximumPercent: 200% (new tasks start before old ones stop) + - MinimumHealthyPercent: 50% (allows rolling replacement) + - Health check grace period: 120s (allows startup time) +- [x] `imageTag` CDK context — shared-services-stack reads `this.node.tryGetContext("imageTag")`, defaults to `{env}-latest` +- [x] Hardcoded in workflow: role ARN (`arn:aws:iam::985539756675:role/clawe-github-ci`), account ID, ECR registry +- [x] GitHub environment secrets configured: `CONVEX_DEPLOY_KEY` (per env: dev + prod) +- [x] GitHub environment variables configured: `CONVEX_URL` (per env: dev + prod) +- [x] Removed `convexUrl` from `config.ts` — now required CDK context (`--context convexUrl=...`), fails fast if missing + +--- + +## Phase 6: Shared Services Deployment (CDK) + +All of Phase 6 was implemented in `shared-services-stack.ts` during Phase 5.1, with deployment config (circuit breaker, rolling updates) added in 5.8/5.10. + +### 6.1 ECS Clusters + +Created in `shared-services-stack.ts`: + +**Shared services cluster:** + +- [x] Name: `clawe-cluster-{env}` +- [x] Hosts: web app + watcher +- [x] Container insights enabled + +**Squadhubs cluster:** + +- [x] Name: `clawe-squadhubs-{env}` +- [x] Hosts: per-tenant squadhub ECS Services +- [x] CloudMap namespace: `clawe-internal-{env}` (for squadhub service discovery) +- [x] Container insights enabled +- [x] Separate cluster keeps tenant workloads isolated from shared services + +### 6.2 Application Load Balancer + +Created in `shared-services-stack.ts`: + +- [x] HTTPS listener (443) with shared wildcard certificate +- [x] HTTP listener (80) — redirect to HTTPS +- [x] Target groups: + - `web-tg`: routes to web app service (default rule) +- [x] Health check: `/api/health` (interval 30s, healthy 2, unhealthy 3) + +### 6.3 Web app Fargate service + +- [x] ECS service for the Next.js app (in `shared-services-stack.ts`): + +- Task definition: + - Image: `fromEcrRepository(webRepo, imageTag)` — image tag from CDK context + - CPU: 512 (dev) / 1024 (prod) + - Memory: 1024 (dev) / 2048 (prod) + - Port: 3000 + - Environment: `NODE_ENV`, `ENVIRONMENT`, `NEXT_PUBLIC_CONVEX_URL` (from context), `NEXT_PUBLIC_COGNITO_*` (from auth stack), `NEXT_PUBLIC_APP_URL`, `AWS_REGION`, infrastructure refs for provisioning (cluster ARN, namespace ID, EFS ID, squadhub image, SG IDs, subnet IDs) + - IAM role: ECS provisioning, EFS access points, CloudMap service discovery, IAM PassRole + - Logging: `/clawe/web-{env}` (30d dev, 90d prod) +- Service: + - Desired count: 1 (dev) / 2 (prod) + - Circuit breaker with auto-rollback + - MaximumPercent: 200%, MinimumHealthyPercent: 50% + - Health check grace period: 120s + - ALB target group attachment + - [ ] Auto-scaling: target CPU 70%, min 1 / max 10 (prod only) — **to add** + +### 6.4 Watcher Fargate service + +- [x] ECS service for the shared watcher (in `shared-services-stack.ts`): + +- Task definition: + - Image: `fromEcrRepository(watcherRepo, imageTag)` — same context as web + - CPU: 256, Memory: 512 + - Environment: `CONVEX_URL` (from context), `ENVIRONMENT` + - Secrets: `WATCHER_TOKEN` from SSM (`/clawe/{env}/watcher-token`) + - IAM role: CloudMap DiscoverInstances + - Logging: `/clawe/watcher-{env}` (30d dev, 90d prod) +- Service: + - Desired count: 1 + - Circuit breaker with auto-rollback + - No ALB attachment (internal service only) + +--- + +## Phase 7: Per-Tenant Squadhub (CDK + Runtime) + +### 7.1 Base squadhub task definition (shared infrastructure in CDK) + +Phase 7.1 creates the **shared CDK infrastructure** that all per-tenant squadhub services reference. The actual per-tenant ECS task definitions and services are created at **runtime** by CloudSquadhubProvisioner (Phase 8). + +Changes in `packages/infra/lib/shared-services-stack.ts`: + +- [x] Shared squadhub execution role (`clawe-squadhub-exec-{env}`) — ECS Fargate execution role for ECR pulls + CloudWatch log writes, with `AmazonECSTaskExecutionRolePolicy` +- [x] Shared squadhub task role (`clawe-squadhub-task-{env}`) — runtime role with EFS mount/write/root access (scoped to file system ARN) + CloudMap `DiscoverInstances` +- [x] Shared squadhub log group (`/clawe/squadhub-{env}`) — tenant-specific stream prefixes set at runtime by CloudSquadhubProvisioner (30d dev, 90d prod retention) +- [x] Export role ARNs + log group to web container env vars: `SQUADHUB_EXECUTION_ROLE_ARN`, `SQUADHUB_TASK_ROLE_ARN`, `SQUADHUB_LOG_GROUP` +- [x] Tightened `iam:PassRole` scope from `resources: ["*"]` to the two specific role ARNs + +Per-tenant ECS task definition template (created at runtime by CloudSquadhubProvisioner in Phase 8): + +- Image: `{account}.dkr.ecr.{region}.amazonaws.com/clawe/squadhub:{env}-latest` +- CPU: 256 (0.25 vCPU) +- Memory: 512 (0.5 GB) +- Port: 18789 +- EFS volume mount: `/data` (access point set per-tenant at runtime) +- Health check: `wget -q --spider http://localhost:18789/health` +- IAM task role: shared `clawe-squadhub-task-{env}` (EFS + CloudMap) +- IAM execution role: shared `clawe-squadhub-exec-{env}` (ECR + logs) +- Logging: shared `/clawe/squadhub-{env}` log group with tenant-specific stream prefix + +### 7.2 CloudMap service discovery + +All CDK infrastructure for CloudMap is already in place (created in Phase 5.1 + 7.1). Per-tenant service registration happens at runtime via CloudSquadhubProvisioner (Phase 8.2). + +- [x] CloudMap private DNS namespace (`clawe-internal-{env}`) — created in shared-services-stack +- [x] Namespace ID + name exported to web container (`CLOUDMAP_NAMESPACE_ID`, `CLOUDMAP_NAMESPACE_NAME`) +- [x] Web app task role has full CloudMap permissions (`CreateService`, `DeleteService`, `RegisterInstance`, `DeregisterInstance`, `DiscoverInstances`, `GetService`, `ListInstances`) +- [x] Watcher task role has `DiscoverInstances` permission +- [x] Squadhub task role has `DiscoverInstances` permission (added in 7.1) + +Runtime (Phase 8.2): CloudSquadhubProvisioner creates per-tenant CloudMap service entries (`squadhub-{tenantId}.clawe-internal-{env}`) and associates with ECS services. Web app resolves `squadhub-{tenantId}.clawe-internal-{env}` → private IP:18789. + +### 7.3 Squadhub lifecycle API routes + +These routes use `getPlugin("squadhub-lifecycle")` from `@clawe/plugins`. In dev, they return no-ops. In cloud, `CloudSquadhubLifecycle` manages ECS services. + +- [x] `POST /api/tenant/squadhub/restart` — calls `getPlugin("squadhub-lifecycle").restart(tenantId)` +- [x] `GET /api/tenant/squadhub/status` — calls `getPlugin("squadhub-lifecycle").getStatus(tenantId)` +- [x] `DELETE /api/tenant/squadhub` — calls `getPlugin("squadhub-lifecycle").destroy(tenantId)` +- [x] Extracted shared `getAuthenticatedTenant()` helper (`lib/api/tenant-auth.ts`) — reads JWT from Authorization header, resolves tenant via Convex + +These routes live in the **base repo** and work for both dev and cloud via the plugin interface. + +--- + +## Phase 8: Cloud Tenant Provisioning + +### 8.1 Cloud plugins package (`packages/cloud-plugins/`) + +Create `@clawe/cloud-plugins` in the `clawe-cloud` repo. This package implements the `@clawe/plugins` interfaces (from Phase 4.1) with real AWS infrastructure. + +**Package structure (in `clawe-cloud` repo):** + +``` +packages/cloud-plugins/ +├── src/ +│ ├── index.ts # register() function — returns PluginMap +│ ├── squadhub-provisioner.ts # CloudSquadhubProvisioner implements SquadhubProvisioner +│ └── squadhub-lifecycle.ts # CloudSquadhubLifecycle implements SquadhubLifecycle +├── package.json # depends on @clawe/plugins (for types), aws-sdk +└── tsconfig.json +``` + +**Registration:** + +```typescript +// cloud-plugins/src/index.ts +import type { PluginMap } from "@clawe/plugins"; +import { CloudSquadhubProvisioner } from "./squadhub-provisioner"; +import { CloudSquadhubLifecycle } from "./squadhub-lifecycle"; + +export function register(): PluginMap { + return { + "squadhub-provisioner": new CloudSquadhubProvisioner(), + "squadhub-lifecycle": new CloudSquadhubLifecycle(), + }; +} +``` + +When `@clawe/cloud-plugins` is installed and `loadPlugins()` runs, the base provision route (Phase 4.2) automatically uses `CloudSquadhubProvisioner` instead of `DefaultSquadhubProvisioner`. No route file changes needed. + +- [x] Create `packages/cloud-plugins/` with `CloudSquadhubProvisioner` and `CloudSquadhubLifecycle` (stub implementations, `throw new Error` for each method) +- [x] Add `@clawe/plugins` as dependency (for interfaces) +- [x] Add `@aws-sdk/client-ecs`, `@aws-sdk/client-efs`, `@aws-sdk/client-servicediscovery` as dependencies +- [x] Already covered by `packages/*` glob in `pnpm-workspace.yaml` +- [x] Add `@clawe/cloud-plugins: "workspace:*"` to `apps/web/package.json` in `clawe-cloud` — this is the single line that activates cloud plugins (without it, `loadPlugins()` falls back to defaults) + +### 8.2 CloudSquadhubProvisioner implementation + +Implement `SquadhubProvisioner` interface with AWS infrastructure: + +**`provision()`:** + +- [x] 1. Create EFS access point for tenant (path: `/tenants/{tenantId}`) +- [x] 2. Register ECS task definition (from base template, with tenant-specific env vars): + - `SQUADHUB_TOKEN` — generated per-tenant UUID + - API keys patched via `patchConfig` after provisioning (from Convex tenant record) + - `CONVEX_URL` — from `params.convexUrl` + - `OPENCLAW_STATE_DIR=/data/config` + - `OPENCLAW_PORT=18789` + - `OPENCLAW_SKIP_GMAIL_WATCHER=1` +- [x] 3. Create ECS Service on `clawe-squadhubs-{env}` cluster with `desiredCount: 1` +- [x] 4. Associate with CloudMap service discovery (`squadhub-{tenantId}.clawe-internal-{env}`) +- [x] 5. Return `{ squadhubUrl, squadhubToken, metadata: { serviceArn, efsAccessPointId } }` + +**`getProvisioningStatus()`:** + +- [x] Query ECS Service running count and task health +- [x] Return `"provisioning"` while tasks are starting, `"active"` when healthy, `"error"` on failure + +**`deprovision(params: DeprovisionParams)`:** + +- [x] Delete ECS Service (force delete, stops running tasks) +- [x] Deregister task definition +- [x] Deregister CloudMap service entry +- [x] Delete EFS access point (via `params.metadata.efsAccessPointId`) +- [x] Best-effort cleanup — continues through all steps, collects errors, throws aggregate error + +**Error handling:** + +- [x] `provision()` uses compensating transactions — on failure, rolls back all created resources (best-effort) +- [x] `deprovision()` uses best-effort pattern — attempts all cleanup steps, throws aggregate error if any non-404 failures + +Environment variables for the squadhub ECS task: + +``` +SQUADHUB_TOKEN={generated-per-tenant} # Used for both: squadhub HTTP auth AND Convex machine auth +CONVEX_URL={env-specific-convex-url} +OPENCLAW_STATE_DIR=/data/config +OPENCLAW_PORT=18789 +OPENCLAW_SKIP_GMAIL_WATCHER=1 +# API keys (Anthropic, OpenAI) are patched into squadhub config via patchConfig after provisioning +``` + +Note: `TENANT_ID` is not needed as an env var — the CLI resolves the tenant by passing `SQUADHUB_TOKEN` as `machineToken` to Convex, which looks up the tenant via the `by_squadhubToken` index. + +**Important: `initProcessEnabled` must be set to `true`** in the ECS container definition's `linuxParameters`. OpenClaw does a "full process restart" on certain config changes (e.g. adding a Telegram channel) — it spawns a child process and the parent exits. Without an init process (tini), PID 1 exits and the container/task dies. The Dockerfile includes `tini` in the ENTRYPOINT, but ECS should also have `initProcessEnabled: true` as a safety net. + +### 8.3 CloudSquadhubLifecycle implementation + +Implement `SquadhubLifecycle` interface with ECS operations: + +**`restart(tenantId)`:** + +- [x] Force new deployment on the tenant's ECS Service via `UpdateServiceCommand` with `forceNewDeployment: true` + +**`stop(tenantId)`:** + +- [x] Set ECS Service `desiredCount: 0` via `UpdateServiceCommand` + +**`destroy(tenantId)`:** + +- [x] Delete ECS Service (force) + deregister task definition +- [x] Delete EFS access point (looked up by `clawe:tenantId` tag) +- [x] Delete CloudMap service entry +- [x] Best-effort cleanup — continues through all steps, throws aggregate error + +**`getStatus(tenantId)`:** + +- [x] Check ECS Service running count via `DescribeServicesCommand` +- [x] Check task-level health via `ListTasksCommand` + `DescribeTasksCommand` +- [x] Return `{ running: runningCount > 0, healthy: container.healthStatus === "HEALTHY" }` + +**Refactoring:** + +- [x] Extracted shared utilities (`getEnvConfig`, `serviceName`, `isNotFoundError`, `toError`, AWS clients) to `aws-config.ts` +- [x] Updated `squadhub-provisioner.ts` to import from `aws-config.ts` + +### 8.4 Provisioning status polling + +Create `apps/web/src/app/api/tenant/status/route.ts`: + +- [x] Calls `getPlugin("squadhub-provisioner").getProvisioningStatus(tenantId)` +- [x] Returns status to client for polling +- [x] Works for both dev (instant "active") and cloud (real status) + +### 8.5 Cloud provisioning UI flow + +**Note**: The shared `/setup/provisioning` screen (Phase 4.7) already handles the full flow — loading state, error handling, and redirect. This step just ensures the cloud polling path works end-to-end: + +- [x] Verify `GET /api/tenant/status` (8.4) returns real ECS status via `CloudSquadhubProvisioner.getProvisioningStatus()` +- [x] Verify `/setup/provisioning` page polls status while `"provisioning"` and shows progress messages ("Creating workspace...", "Starting services...", "Almost ready...") +- [x] Verify redirect to `/setup/welcome` once status is `"active"` +- [x] No new UI page needed — reuses the shared screen from 4.7 + +### 8.6 Dynamic squadhub routing + +Currently the web app reads `SQUADHUB_URL` from env. This needs to become per-tenant: + +- [x] Update `apps/web/src/lib/squadhub/connection.ts`: + - `getConnection(tenant?)` — uses tenant's `squadhubUrl` + `squadhubToken` if provided + - Falls back to env vars when no tenant (self-hosted / dev) + - No cache needed — tenant data comes from the same `getAuthenticatedTenant()` query used for auth + +- [x] Update all API routes that call the squadhub (already refactored in Phase 1 to accept params): + - `/api/chat` — resolve tenant's squadhub URL before calling + - `/api/chat/history` — same + - `/api/chat/abort` — same + - `/api/squadhub/health` — same + - `/api/squadhub/pairing` — same + +- [x] Refactor `getAuthenticatedTenant` return type to clean discriminated union (`{ error, convex, tenant }`) + +### 8.7 API Key Management + +API keys (Anthropic required, OpenAI optional) are stored in the Convex tenant record — the single source of truth. Removed from env vars entirely. + +Update onboarding flow (4 → 5 steps): + +- [x] New `/setup/api-keys` step (2/5) between Welcome and Business +- [x] Anthropic API key input (required, validated server-side via `/api/tenant/validate-key`) +- [x] OpenAI API key input (optional, validated server-side) +- [x] Store keys in Convex via `tenants.setApiKeys()` +- [x] Update Welcome page: navigate to `/setup/api-keys`, add API key to "You'll need" list +- [x] Update step numbers: Business (3/5), Telegram (4/5), Complete (5/5) + +Server-side key validation: + +- [x] `POST /api/tenant/validate-key` — validates Anthropic/OpenAI keys server-side + +Provision route integration: + +- [x] After provisioning, read keys from tenant record and patch into squadhub config via `patchApiKeys()` +- [x] `patchApiKeys()` server action calls `patchConfig()` to set auth profiles + +Settings page: + +- [x] Add "API Keys" section to `/settings/general` (between General and Timezone) +- [x] Query `getApiKeys()` for masked display +- [x] Validate + save new keys via `setApiKeys()` mutation +- [x] In cloud (`hasPlugin()`): call `/api/tenant/squadhub/restart` after save +- [x] In dev: show info message "Restart your squadhub container to apply changes" + +Env var cleanup: + +- [x] Remove `ANTHROPIC_API_KEY` and `OPENAI_API_KEY` from `docker-compose.yml` +- [x] Remove from `.env.example`, `turbo.json`, `scripts/start.sh` +- [x] Update `CLAUDE.md`, `apps/web/CLAUDE.md`, `README.md` documentation + +### 8.8 Squadhub extensions package (`packages/squadhub-extensions/`) + +**Problem**: The pairing flow (`listChannelPairingRequests` / `approveChannelPairingCode` in `packages/shared/src/squadhub/pairing.ts`) reads/writes JSON files on the squadhub's local filesystem (`~/.squadhub/credentials/-pairing.json`). In cloud, the web app container can't access the squadhub container's filesystem. + +**Solution**: Create an OpenClaw plugin package that runs inside the squadhub container (in-process with the Gateway). It has direct filesystem access and exposes operations as tools callable via the existing `POST /tools/invoke` HTTP API — same mechanism used for `getConfig`, `patchConfig`, `sessions_list`, etc. + +The package is a general-purpose home for all Clawe-specific OpenClaw extensions. Each tool lives in its own file under `tools/`. Adding a new tool = add a file + import it in `index.ts`. + +OpenClaw uses [jiti](https://github.com/unjs/jiti) to load plugins, so it can load raw `.ts` files directly. We still use tsup to compile to JS for CI validation (type checking + build verification). + +**Package structure:** + +``` +packages/squadhub-extensions/ +├── src/ +│ ├── index.ts # Plugin entry: imports and registers all tools +│ └── tools/ +│ └── pairing.ts # clawe_pairing tool (list + approve actions) +├── openclaw.plugin.json # OpenClaw plugin manifest +├── package.json +├── tsconfig.json +└── tsup.config.ts # Bundles to single JS file (like CLI) +``` + +**Plugin manifest** (`openclaw.plugin.json`): + +```json +{ + "id": "clawe", + "configSchema": { + "type": "object", + "additionalProperties": false, + "properties": {} + } +} +``` + +**Plugin entry** (`src/index.ts`): + +```typescript +import type { OpenClawPluginApi } from "openclaw/plugin-sdk"; +import { registerPairingTool } from "./tools/pairing"; + +export default function register(api: OpenClawPluginApi) { + registerPairingTool(api); + // Future tools registered here +} +``` + +**Pairing tool** (`src/tools/pairing.ts`): + +Registers a `clawe_pairing` tool with two actions: + +- `list` — reads `-pairing.json` from the squadhub state dir, prunes expired requests, returns pending list +- `approve` — finds matching code, reads config via gateway API (`getConfig`), updates `allowFrom` via `patchConfig`, removes request from pairing file + +The tool replicates the logic from `packages/shared/src/squadhub/pairing.ts` but runs inside the squadhub container where it has filesystem access. + +**Build** (`tsup.config.ts`): + +```typescript +export default defineConfig({ + entry: { index: "src/index.ts" }, + format: ["esm"], + target: "node22", + platform: "node", + outDir: "dist", + clean: true, + bundle: true, + noExternal: [/.*/], +}); +``` + +**Package scripts** (`package.json`): + +```json +{ + "name": "@clawe/squadhub-extensions", + "scripts": { + "build": "tsup", + "check-types": "tsc --noEmit" + } +} +``` + +**Dockerfile changes** (`docker/squadhub/Dockerfile`): + +```dockerfile +# Copy Clawe OpenClaw extensions (plugin bundle + manifest) +COPY packages/squadhub-extensions/dist/index.js /opt/clawe/extensions/clawe/dist/index.js +COPY packages/squadhub-extensions/openclaw.plugin.json /opt/clawe/extensions/clawe/openclaw.plugin.json +COPY packages/squadhub-extensions/package.json /opt/clawe/extensions/clawe/package.json +``` + +**Config template changes** (`docker/squadhub/templates/config.template.json`): + +Add plugin loading config (points directly to the static path, no entrypoint changes needed): + +```json +{ + "plugins": { + "load": { + "paths": ["/opt/clawe/extensions/clawe"] + } + } +} +``` + +**Web app changes** — update pairing route and shared client: + +1. Add pairing tool functions to `packages/shared/src/squadhub/client.ts`: + - `listPairingRequests(connection, channel)` — calls `invokeTool(connection, "clawe_pairing", "list", { channel })` + - `approvePairingCode(connection, channel, code)` — calls `invokeTool(connection, "clawe_pairing", "approve", { channel, code })` + +2. Update `apps/web/src/app/api/squadhub/pairing/route.ts`: + - GET: replace `listChannelPairingRequests(channel)` (filesystem) with `listPairingRequests(getConnection(auth.tenant), channel)` (HTTP tool invoke) + - POST: replace `approveChannelPairingCode(connection, channel, code)` (filesystem + HTTP) with `approvePairingCode(connection, channel, code)` (HTTP tool invoke only) + +3. The old filesystem-based functions in `packages/shared/src/squadhub/pairing.ts` can be removed — the web app no longer calls them directly. + +**Implementation tasks:** + +- [x] Create `packages/squadhub-extensions/` package with `package.json`, `tsconfig.json`, `tsup.config.ts` +- [x] Implement `openclaw.plugin.json` manifest +- [x] Implement `src/index.ts` — plugin entry that registers all tools +- [x] Implement `src/tools/pairing.ts` — `clawe_pairing` tool with `list` and `approve` actions +- [x] Add `listPairingRequests` and `approvePairingCode` to `packages/shared/src/squadhub/client.ts` (invoke via `/tools/invoke`) +- [x] Update `apps/web/src/app/api/squadhub/pairing/route.ts` to use new client functions instead of filesystem-based pairing +- [x] Update `docker/squadhub/Dockerfile` — copy built extensions to extensions directory +- [x] Update `docker/squadhub/templates/config.template.json` — add `plugins.load.paths` +- [x] Build passes: `pnpm --filter @clawe/squadhub-extensions build` +- [x] Type check passes: `pnpm --filter @clawe/squadhub-extensions check-types` +- [x] Remove old filesystem-based pairing functions from `packages/shared/src/squadhub/pairing.ts` + +**Verification:** + +1. `pnpm --filter @clawe/squadhub-extensions build` — bundles to single JS file in `dist/` +2. `pnpm --filter @clawe/squadhub-extensions check-types` — clean +3. Docker build includes plugin in `/opt/clawe/extensions/clawe/` +4. Squadhub gateway starts with plugin loaded (check logs for plugin registration) +5. `POST /tools/invoke { tool: "clawe_pairing", action: "list", args: { channel: "telegram" } }` returns pairing requests +6. End-to-end: Telegram pairing flow works via the plugin (list → approve → config updated) + +--- + +## Phase 9: Cloud Onboarding Flow Updates + +### 9.1 New onboarding step order + +``` +1. Sign in with Google (Cognito OAuth redirect) +2. Provisioning (auto — create tenant + start squadhub) +3. Anthropic API Key (new step — enter API key, stored in Convex tenant record) +4. Business Context (existing — chat with Clawe agent) +5. Telegram (optional) (existing — bot token + pairing) +6. Complete (existing) +``` + +### 9.2 API Key onboarding step + +**Completed in Phase 8.7** — `/setup/api-keys` page handles both Anthropic (required) and OpenAI (optional) keys with server-side validation. Works for all modes (dev + cloud). + +### 9.3 Settings — API Key management + +**Completed in Phase 8.7** — API Keys section in `/settings/general` with masked display, validation, and cloud restart support. + +--- + +## Phase 10: DNS & CDN (CDK) + +All of Phase 10 was implemented in `dns-stack.ts` during Phase 5.1, updated in 5.8 to use shared wildcard cert. + +### 10.1 Hosted Zone lookup + +- [x] Look up existing Route53 hosted zone for `clawe.io` via `HostedZone.fromLookup` + +### 10.2 ACM Certificate + +- [x] Uses shared wildcard cert from Phase 5.8 (`*.clawe.io` + `clawe.io`), passed as prop to dns-stack + +### 10.3 CloudFront distribution + +- [x] Created in `dns-stack.ts`: + - Origin: ALB via HTTPS (wildcard cert on ALB) + - Alternate domain: `app.clawe.io` / `app-dev.clawe.io` + - Certificate: shared wildcard cert + - Cache policy: caching disabled (dynamic Next.js app) + - Origin request policy: ALL_VIEWER (forwards all headers/cookies) + - Allowed methods: ALL (GET, POST, PUT, DELETE, etc.) + - Exports `DistributionId` (used by deploy workflow for cache invalidation) + +### 10.4 Route53 record + +- [x] A record (alias) pointing `app.clawe.io` / `app-dev.clawe.io` to CloudFront distribution + +--- + +## Phase 11: Docker Images & CI/CD + +CI/CD foundation was set up in Phase 5.9–5.10. Image builds use `docker/build-push-action` with GitHub Actions layer caching. + +### 11.1 Image tagging convention + +- [x] Tagging implemented in deploy workflow (Phase 5.10): + - `clawe/web:{env}-{git-sha}` + `clawe/web:{env}-latest` + - `clawe/watcher:{env}-{git-sha}` + `clawe/watcher:{env}-latest` + - `clawe/squadhub:{env}-{git-sha}` + `clawe/squadhub:{env}-latest` +- [x] Uses `docker/metadata-action@v5` for tag/label generation +- [x] Uses `docker/build-push-action@v6` with `docker/setup-buildx-action@v3` +- [x] GitHub Actions layer caching (`cache-from/cache-to: type=gha`) per image scope +- [x] Per-tenant squadhub rolling update via `aws ecs update-service --force-new-deployment` on all services in squadhubs cluster + +### 11.2 Refine deploy workflow (future optimization) + +- [ ] Use `turbo prune --docker` in Dockerfiles for smaller images and faster builds +- [ ] Parallel jobs: build web/watcher/squadhub images in parallel +- [ ] Selective builds: only rebuild images whose source changed (based on git diff) +- [ ] Post-deploy Slack notification (optional) + +--- + +## Phase 12: Monitoring (CDK) + +Basic monitoring created in `monitoring-stack.ts` during Phase 5.1. Log groups created in `shared-services-stack.ts`. + +### 12.1 CloudWatch alarms + +- [x] ALB 5xx error rate alarm (threshold > 5 in 5 minutes) — in `monitoring-stack.ts` +- [ ] Web app: CPU > 80%, unhealthy targets +- [ ] Squadhub services: task stopped unexpectedly, health check failures +- [ ] Watcher: task stopped, log errors +- [ ] EFS: burst credit balance low + +### 12.2 CloudWatch dashboard + +- [x] ALB request count widget — in `monitoring-stack.ts` +- [x] ALB 5xx errors widget — in `monitoring-stack.ts` +- [ ] Active tenants count +- [ ] Squadhub services running / restarting +- [ ] Web app latency +- [ ] EFS usage + +### 12.3 Log groups + +- [x] `/clawe/web-{env}` — web app logs (30d dev, 90d prod) — in `shared-services-stack.ts` +- [x] `/clawe/watcher-{env}` — watcher logs (30d dev, 90d prod) — in `shared-services-stack.ts` +- [x] `/clawe/squadhub-{env}` — shared squadhub log group with per-tenant stream prefixes (Phase 7.1, 30d dev, 90d prod) + +--- + +## Implementation Order + +``` +Phase 1 (Shared client & watcher refactor) — ✅ COMPLETE + │ +Phase 2 (Multi-tenancy in Convex) — ✅ COMPLETE + │ +Phase 3 (Authentication) — ✅ COMPLETE + │ +Phase 4 (Dev & self-hosted provisioning) — ✅ COMPLETE + │ includes: @clawe/plugins package (interfaces + registry + dev defaults) + ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ + │ ↑ Dev & self-hosted complete here (base repo: clawe) + │ ↓ Everything below is cloud-only (clawe-cloud repo) + │ +Phase 5 (CDK setup + base infra) — ✅ COMPLETE (all stacks + CI/CD + SSM params) + │ +Phase 6 (Shared services CDK) — ✅ COMPLETE (done in 5.1, auto-scaling pending) + │ +Phase 7 (Per-tenant squadhub CDK) — NEXT + │ +Phase 8 (Cloud tenant provisioning) — @clawe/cloud-plugins + AWS provisioning + │ includes: CloudSquadhubProvisioner + CloudSquadhubLifecycle + @clawe/squadhub-extensions + │ 8.7 API Key Management: ✅ COMPLETE (onboarding step, settings UI, env var cleanup) + │ +Phase 9 (Cloud onboarding updates) — 9.2/9.3 completed in Phase 8.7 + │ +Phase 10 (DNS & CDN) — ✅ COMPLETE (done in 5.1) + │ +Phase 11 (CI/CD) — ✅ COMPLETE (deploy workflow + image builds done in 5.10) + │ +Phase 12 (Monitoring) — PARTIAL (basic dashboard + 5xx alarm + log groups done) +``` + +Phases 1-4: complete (base repo: clawe). +Phases 5, 6, 10, 11: complete (clawe-cloud repo) — all CDK stacks, CI/CD, DNS/CDN. +Phase 12: partial — basic monitoring done, advanced alarms/dashboards deferred. +**Next: Phase 7** (per-tenant squadhub infrastructure) → Phase 8 (cloud provisioning) → Phase 9 (onboarding). + +--- + +## Files to Create + +### Base repo (clawe) + +``` +packages/plugins/ (new — Phase 4.1) +├── src/ +│ ├── index.ts # Main exports: loadPlugins, hasPlugin, getPlugin +│ ├── registry.ts # Plugin registry internals +│ ├── interfaces/ +│ │ ├── index.ts # Re-exports all interfaces +│ │ ├── squadhub-provisioner.ts # SquadhubProvisioner interface +│ │ └── squadhub-lifecycle.ts # SquadhubLifecycle interface +│ └── defaults/ +│ ├── index.ts # Dev default implementations +│ ├── squadhub-provisioner.ts # DefaultSquadhubProvisioner (reads env vars) +│ └── squadhub-lifecycle.ts # DefaultSquadhubLifecycle (no-ops) +├── package.json +└── tsconfig.json + +apps/web/src/ +├── proxy.ts (new — renamed from middleware.ts) +├── app/auth/ +│ └── login/page.tsx (new — "Sign in with Google" button) +├── app/api/tenant/ +│ ├── provision/route.ts (new) +│ ├── status/route.ts (new — uses getPlugin("squadhub-provisioner")) +│ └── squadhub/ +│ ├── restart/route.ts (new — uses getPlugin("squadhub-lifecycle")) +│ └── status/route.ts (new — uses getPlugin("squadhub-lifecycle")) +├── providers/auth-provider.tsx (new) +├── lib/squadhub/connection.ts (updated — tenant-aware getConnection) +└── hooks/use-auth.ts (new) + +packages/backend/convex/ +├── auth.config.cognito.ts (new — Cognito auth provider template) +├── auth.config.nextauth.ts (new — NextAuth customJwt provider) +├── auth.config.ts (generated — copied from template at deploy time) +├── users.ts (new) +├── tenants.ts (new) +├── accounts.ts (new — account management) +├── lib/auth.ts (new) +├── schema.ts (modify — add tenantId everywhere) +├── agents.ts (modify — tenant filtering) +├── tasks.ts (modify — tenant filtering) +├── messages.ts (modify — tenant filtering) +├── notifications.ts (modify — tenant filtering) +├── activities.ts (modify — tenant filtering) +├── documents.ts (modify — tenant filtering) +├── settings.ts (deleted — migrated to tenants.ts) +├── businessContext.ts (modify — tenant filtering) +├── channels.ts (modify — tenant filtering) +└── routines.ts (modify — tenant filtering) + +packages/squadhub-extensions/ (new — Phase 8.8) +├── src/ +│ ├── index.ts # Plugin entry: registers all tools +│ └── tools/ +│ └── pairing.ts # clawe_pairing tool +├── openclaw.plugin.json # Plugin manifest (id: "clawe") +├── package.json +├── tsconfig.json +└── tsup.config.ts # Bundles to single JS file + +packages/shared/src/squadhub/ +├── client.ts (modify — accept url/token params, add pairing tool invocations) +└── gateway-client.ts (modify — accept url/token params) + +packages/cli/src/ +├── client.ts (modify — add machineToken to all calls) +└── commands/*.ts (modify — pass machineToken) + +apps/watcher/src/ +├── index.ts (modify — multi-tenant loop) +└── config.ts (modify — remove single-tenant env vars) + +apps/web/src/lib/squadhub/ +└── actions.ts (modify — per-tenant routing) +``` + +### Cloud repo (clawe-cloud) — only new files, no base file modifications + +``` +packages/cloud-plugins/ (new — Phase 8.1) +├── src/ +│ ├── index.ts # register() → PluginMap with cloud impls +│ ├── squadhub-provisioner.ts # CloudSquadhubProvisioner implements SquadhubProvisioner +│ └── squadhub-lifecycle.ts # CloudSquadhubLifecycle implements SquadhubLifecycle +├── package.json # depends on @clawe/plugins, @aws-sdk/* +└── tsconfig.json + +packages/infra/ (new — Phase 5+) +├── bin/clawe.ts +├── lib/ +│ ├── config.ts +│ ├── certificate-stack.ts # Shared wildcard cert (Phase 5.8) +│ ├── ci-stack.ts # GitHub OIDC + CI role (Phase 5.9) +│ ├── networking-stack.ts +│ ├── auth-stack.ts +│ ├── storage-stack.ts +│ ├── shared-services-stack.ts +│ ├── tenant-stack.ts +│ ├── dns-stack.ts +│ └── monitoring-stack.ts +├── scripts/ +│ └── build-and-push.sh +├── cdk.json +├── tsconfig.json +└── package.json + +apps/web/src/app/setup/ +└── api-keys/page.tsx (new — Phase 8.7, API key onboarding step) + +apps/web/src/app/api/tenant/ +└── validate-key/route.ts (new — Phase 8.7, server-side key validation) + +apps/web/src/app/(dashboard)/settings/general/_components/ +└── api-keys-settings.tsx (new — Phase 8.7, settings API key management) + +.github/workflows/ +└── deploy.yml (new — Phase 5.10) +``` + +## Files to Modify + +### Base repo + +``` +apps/web/src/app/layout.tsx (add auth provider) +apps/web/src/app/(dashboard)/layout.tsx (add auth guard) +apps/web/src/app/setup/welcome/page.tsx (update step flow — 1/5, nav to /setup/api-keys) +apps/web/src/app/setup/business/page.tsx (step 3/5) +apps/web/src/app/setup/telegram/page.tsx (step 4/5) +apps/web/src/app/setup/complete/page.tsx (step 5/5) +apps/web/src/app/(dashboard)/settings/general/page.tsx (add ApiKeysSettings section) +apps/web/src/lib/squadhub/actions.ts (add patchApiKeys server action) +apps/web/package.json (add @clawe/plugins, auth deps) +apps/web/src/lib/squadhub/connection.ts (use getPlugin("squadhub-provisioner") result instead of env vars) +apps/web/src/app/api/tenant/provision/route.ts (use getPlugin + patch API keys after provisioning) +apps/web/src/app/api/squadhub/pairing/route.ts (use invokeTool instead of filesystem — Phase 8.8) +docker-compose.yml (remove ANTHROPIC_API_KEY, OPENAI_API_KEY from squadhub) +.env.example (remove API key entries, add UI management note) +turbo.json (remove API keys from globalPassThroughEnv) +scripts/start.sh (remove API key validation checks) +packages/backend/convex/schema.ts (add openaiApiKey to tenants) +packages/backend/convex/tenants.ts (setApiKeys mutation, getApiKeys query) +packages/shared/src/squadhub/pairing.ts (remove filesystem-based functions — Phase 8.8) +docker/squadhub/Dockerfile (copy extensions to extensions dir — Phase 8.8) +docker/squadhub/templates/config.template.json (add plugins.load.paths — Phase 8.8) +pnpm-workspace.yaml (add packages/plugins) +package.json (add workspace scripts) +``` + +### Cloud repo + +``` +apps/web/package.json (add @clawe/cloud-plugins) +apps/watcher/package.json (add aws-sdk for CloudMap) +docker/squadhub/entrypoint.sh (add agent registration on first start) +pnpm-workspace.yaml (add packages/cloud-plugins, infra/) +package.json (add infra scripts) +``` diff --git a/turbo.json b/turbo.json index d25f609..79ee0ef 100644 --- a/turbo.json +++ b/turbo.json @@ -4,12 +4,11 @@ "globalEnv": ["ENVIRONMENT", "CLAWE_EDITION"], "globalPassThroughEnv": [ "CONVEX_URL", - "ANTHROPIC_API_KEY", - "OPENAI_API_KEY", "SQUADHUB_URL", "SQUADHUB_TOKEN", "CLAWE_DATA_DIR", "SQUADHUB_STATE_DIR", + "CLAWE_EDITION", "AUTH_PROVIDER", "NEXTAUTH_SECRET", "NEXTAUTH_URL",