diff --git a/go.mod b/go.mod index 35f452639c..abc9949ec0 100644 --- a/go.mod +++ b/go.mod @@ -110,6 +110,7 @@ require ( github.com/golang-jwt/jwt/v5 v5.3.0 // indirect github.com/golang/glog v1.2.4 // indirect github.com/golang/protobuf v1.5.4 // indirect + github.com/gomarkdown/markdown v0.0.0-20260217112301-37c66b85d6ab // indirect github.com/google/gnostic-models v0.6.9 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/go-querystring v1.1.0 // indirect @@ -132,7 +133,7 @@ require ( github.com/json-iterator/go v1.1.12 // indirect github.com/magiconair/properties v1.8.7 // indirect github.com/mailru/easyjson v0.7.7 // indirect - github.com/microcosm-cc/bluemonday v1.0.26 // indirect + github.com/microcosm-cc/bluemonday v1.0.27 // indirect github.com/moby/spdystream v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect diff --git a/go.sum b/go.sum index 09f10eb945..2088a9f6c1 100644 --- a/go.sum +++ b/go.sum @@ -151,6 +151,8 @@ github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUv github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/gomarkdown/markdown v0.0.0-20260217112301-37c66b85d6ab h1:VYNivV7P8IRHUam2swVUNkhIdp0LRRFKe4hXNnoZKTc= +github.com/gomarkdown/markdown v0.0.0-20260217112301-37c66b85d6ab/go.mod h1:JDGcbDT52eL4fju3sZ4TeHGsQwhG9nbDV21aMyhwPoA= github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= @@ -311,6 +313,8 @@ github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3v github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= github.com/microcosm-cc/bluemonday v1.0.26 h1:xbqSvqzQMeEHCqMi64VAs4d8uy6Mequs3rQ0k/Khz58= github.com/microcosm-cc/bluemonday v1.0.26/go.mod h1:JyzOCs9gkyQyjs+6h10UEVSe02CGwkhd72Xdqh78TWs= +github.com/microcosm-cc/bluemonday v1.0.27 h1:MpEUotklkwCSLeH+Qdx1VJgNqLlpY2KXwXFM08ygZfk= +github.com/microcosm-cc/bluemonday v1.0.27/go.mod h1:jFi9vgW+H7c3V0lb6nR74Ib/DIB5OBs92Dimizgw2cA= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= diff --git a/internal/analysisengine/engine.go b/internal/analysisengine/engine.go index 577b3d33de..01e9293476 100644 --- a/internal/analysisengine/engine.go +++ b/internal/analysisengine/engine.go @@ -29,6 +29,8 @@ type ClusterInfo struct { Region string CloudProvider string Version string + Type string // e.g. "rosa", "osd", "aro" + Hypershift bool } // Config holds configuration for the analysis engine. diff --git a/internal/prompts/prompts.go b/internal/prompts/prompts.go index c1dfc323b2..fafa2f8c66 100644 --- a/internal/prompts/prompts.go +++ b/internal/prompts/prompts.go @@ -71,6 +71,12 @@ func (ps *PromptStore) loadTemplates(filesystem fs.FS) error { }) } +// RegisterTemplates loads additional templates from the given filesystem, +// overwriting any existing templates with the same ID. +func (ps *PromptStore) RegisterTemplates(templatesFS fs.FS) error { + return ps.loadTemplates(templatesFS) +} + func (ps *PromptStore) GetTemplate(id string) (*PromptTemplate, error) { template, exists := ps.templates[id] if !exists { diff --git a/internal/prompts/prompts_test.go b/internal/prompts/prompts_test.go index f1565da42e..7be494358d 100644 --- a/internal/prompts/prompts_test.go +++ b/internal/prompts/prompts_test.go @@ -2,6 +2,7 @@ package prompts import ( "testing" + "testing/fstest" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -12,6 +13,33 @@ func TestNewPromptStore(t *testing.T) { require.NoError(t, err) require.NotNil(t, store) assert.Greater(t, len(store.templates), 0, "Should have loaded some templates") + + _, err = store.GetTemplate("default") + assert.NoError(t, err, "default template should be loaded") +} + +func TestRegisterTemplates_OverwritesExisting(t *testing.T) { + store, err := NewPromptStore(DefaultTemplates()) + require.NoError(t, err) + + original, err := store.GetTemplate("default") + require.NoError(t, err) + require.NotNil(t, original) + + replacementYAML := `system_prompt: "replacement system prompt" +user_prompt: "replacement user prompt" +` + overrideFS := fstest.MapFS{ + "default.yaml": &fstest.MapFile{Data: []byte(replacementYAML)}, + } + + err = store.RegisterTemplates(overrideFS) + require.NoError(t, err) + + updated, err := store.GetTemplate("default") + require.NoError(t, err) + assert.Equal(t, "replacement system prompt", updated.SystemPrompt) + assert.Equal(t, "replacement user prompt", updated.UserPrompt) } func TestGetTemplate(t *testing.T) { diff --git a/pkg/krknai/aggregator/aggregator.go b/pkg/krknai/aggregator/aggregator.go index d06b1c4f6c..4cd30437fb 100644 --- a/pkg/krknai/aggregator/aggregator.go +++ b/pkg/krknai/aggregator/aggregator.go @@ -15,6 +15,15 @@ import ( "gopkg.in/yaml.v3" ) +// ClusterInfo holds cluster metadata for krkn-ai analysis. +type ClusterInfo struct { + ID string `json:"id,omitempty" yaml:"id,omitempty"` + Version string `json:"version,omitempty" yaml:"version,omitempty"` + Type string `json:"type,omitempty" yaml:"type,omitempty"` // Combined: "cloud/platform[-hcp]", e.g. "aws/rosa-hcp" + Region string `json:"region,omitempty" yaml:"region,omitempty"` + Environment string `json:"environment,omitempty" yaml:"environment,omitempty"` // e.g. "stage", "production", "integration" +} + const ( // Default file paths relative to results directory allCSVPath = "reports/all.csv" @@ -29,6 +38,7 @@ const ( type KrknAIAggregator struct { logger logr.Logger topScenariosCount int + clusterInfo *ClusterInfo } // KrknAIData holds aggregated krkn-ai results with minimal context. @@ -39,6 +49,7 @@ type KrknAIData struct { HealthCheckReport []HealthCheckResult `json:"healthCheckReport"` LogArtifacts []internalAggregator.LogEntry `json:"logArtifacts"` ConfigSummary string `json:"configSummary,omitempty"` + ClusterInfo *ClusterInfo `json:"clusterInfo,omitempty"` } // KrknAISummary provides high-level statistics about the chaos test run. @@ -89,6 +100,16 @@ func (a *KrknAIAggregator) WithTopScenariosCount(count int) *KrknAIAggregator { return a } +// WithClusterInfo sets cluster metadata to include in collected data. +// A defensive copy is stored so later mutations by the caller don't affect stored data. +func (a *KrknAIAggregator) WithClusterInfo(info *ClusterInfo) *KrknAIAggregator { + if info != nil { + cp := *info + a.clusterInfo = &cp + } + return a +} + // Collect gathers krkn-ai results from the specified directory. func (a *KrknAIAggregator) Collect(ctx context.Context, resultsDir string) (*KrknAIData, error) { a.logger.Info("collecting krkn-ai results", "resultsDir", resultsDir) @@ -98,6 +119,10 @@ func (a *KrknAIAggregator) Collect(ctx context.Context, resultsDir string) (*Krk } data := &KrknAIData{} + if a.clusterInfo != nil { + cp := *a.clusterInfo + data.ClusterInfo = &cp + } var collectionErrors []string // Collect scenario results from all.csv diff --git a/pkg/krknai/aggregator/aggregator_test.go b/pkg/krknai/aggregator/aggregator_test.go index e6509ad02b..5f88b90452 100644 --- a/pkg/krknai/aggregator/aggregator_test.go +++ b/pkg/krknai/aggregator/aggregator_test.go @@ -313,3 +313,44 @@ scenario: require.NoError(t, os.WriteFile(filepath.Join(resultsDir, "krkn-ai.yaml"), []byte(configYAML), 0o644)) } + +func TestWithClusterInfo_DefensiveCopy(t *testing.T) { + info := &ClusterInfo{ + ID: "original-id", + Version: "4.14.0", + Type: "aws/rosa-hcp", + Region: "us-east-1", + Environment: "stage", + } + + agg := NewKrknAIAggregator(context.Background()) + agg.WithClusterInfo(info) + + // Mutate the caller's struct after passing it in + info.ID = "mutated-id" + info.Region = "eu-west-1" + + assert.Equal(t, "original-id", agg.clusterInfo.ID, "stored copy must be isolated from caller mutation") + assert.Equal(t, "us-east-1", agg.clusterInfo.Region, "stored copy must be isolated from caller mutation") +} + +func TestCollect_ClusterInfoIsolation(t *testing.T) { + tempDir := t.TempDir() + resultsDir := filepath.Join(tempDir, "results") + reportsDir := filepath.Join(resultsDir, "reports") + require.NoError(t, os.MkdirAll(reportsDir, 0o755)) + createKrknAITestFiles(t, resultsDir, reportsDir) + + info := &ClusterInfo{ID: "test-cluster", Version: "4.14.0"} + agg := NewKrknAIAggregator(context.Background()) + agg.WithClusterInfo(info) + + data, err := agg.Collect(context.Background(), resultsDir) + require.NoError(t, err) + require.NotNil(t, data.ClusterInfo) + + // The output ClusterInfo should be a separate copy from the aggregator's internal one + assert.Equal(t, "test-cluster", data.ClusterInfo.ID) + data.ClusterInfo.ID = "mutated-output" + assert.Equal(t, "test-cluster", agg.clusterInfo.ID, "aggregator's stored copy must not be affected by output mutation") +} diff --git a/pkg/krknai/analysisengine/engine.go b/pkg/krknai/analysisengine/engine.go index b80afa802e..bd6fe8eeef 100644 --- a/pkg/krknai/analysisengine/engine.go +++ b/pkg/krknai/analysisengine/engine.go @@ -1,14 +1,20 @@ package analysisengine import ( + "bytes" "context" "embed" "fmt" + "html/template" "io/fs" "os" "path/filepath" "time" + "github.com/gomarkdown/markdown" + mdhtml "github.com/gomarkdown/markdown/html" + "github.com/gomarkdown/markdown/parser" + "github.com/microcosm-cc/bluemonday" "github.com/openshift/osde2e/internal/analysisengine" "github.com/openshift/osde2e/internal/llm" "github.com/openshift/osde2e/internal/llm/tools" @@ -18,21 +24,22 @@ import ( "gopkg.in/yaml.v3" ) -//go:embed prompts/krknai.yaml -var krknaiTemplatesFS embed.FS +//go:embed prompts/* +var krknPrompts embed.FS const ( analysisDirName = "llm-analysis" summaryFileName = "summary.yaml" - // krknAIPromptTemplate is the prompt template ID for krkn-ai analysis. krknAIPromptTemplate = "krknai" + htmlTemplatePath = "prompts/report.html" ) // Config holds configuration for the krkn-ai analysis engine. type Config struct { analysisengine.BaseConfig - TopScenariosCount int // Number of top scenarios to include (default: 10) + TopScenariosCount int // Number of top scenarios to include (default: 10) + ReportFormat string // "json" (default), "markdown", or "html" } // Engine analyzes krkn-ai chaos test results using LLM. @@ -60,14 +67,17 @@ func New(ctx context.Context, config *Config) (*Engine, error) { agg.WithTopScenariosCount(config.TopScenariosCount) } - templatesFS, err := fs.Sub(krknaiTemplatesFS, "prompts") + promptStore, err := prompts.NewPromptStore(prompts.DefaultTemplates()) if err != nil { - return nil, fmt.Errorf("failed to access embedded prompts: %w", err) + return nil, fmt.Errorf("failed to initialize prompt store: %w", err) } - promptStore, err := prompts.NewPromptStore(templatesFS) + localFS, err := fs.Sub(krknPrompts, "prompts") if err != nil { - return nil, fmt.Errorf("failed to initialize prompt store: %w", err) + return nil, fmt.Errorf("failed to load krkn-ai prompt templates: %w", err) + } + if err := promptStore.RegisterTemplates(localFS); err != nil { + return nil, fmt.Errorf("failed to register krkn-ai prompt templates: %w", err) } client, err := llm.NewGeminiClient(ctx, config.APIKey) @@ -88,6 +98,12 @@ func New(ctx context.Context, config *Config) (*Engine, error) { }, nil } +// WithClusterInfo sets cluster metadata on the aggregator for inclusion in collected data. +func (e *Engine) WithClusterInfo(info *krknAggregator.ClusterInfo) *Engine { + e.aggregator.WithClusterInfo(info) + return e +} + // Run executes the krkn-ai analysis workflow. func (e *Engine) Run(ctx context.Context) (*analysisengine.Result, error) { // Collect krkn-ai results @@ -99,7 +115,7 @@ func (e *Engine) Run(ctx context.Context) (*analysisengine.Result, error) { // Create tool registry with log artifacts for read_file tool toolRegistry := tools.NewRegistry(data.LogArtifacts) - // Prepare template variables + // Prepare template variables from collected data vars := map[string]any{ "Summary": data.Summary, "TopScenarios": data.TopScenarios, @@ -108,6 +124,9 @@ func (e *Engine) Run(ctx context.Context) (*analysisengine.Result, error) { "LogArtifacts": data.LogArtifacts, "ConfigSummary": data.ConfigSummary, } + if data.ClusterInfo != nil { + vars["ClusterInfo"] = data.ClusterInfo + } // Render prompt using prompt store userPrompt, llmConfig, err := e.promptStore.RenderPrompt(krknAIPromptTemplate, vars) @@ -134,10 +153,19 @@ func (e *Engine) Run(ctx context.Context) (*analysisengine.Result, error) { return nil, fmt.Errorf("LLM analysis failed: %w", err) } + content := result.Content + if e.config.ReportFormat == "html" { + var err error + content, err = markdownToHTML(content) + if err != nil { + return nil, fmt.Errorf("failed to convert markdown to HTML: %w", err) + } + } + // Build analysis result analysisResult := &analysisengine.Result{ Status: "completed", - Content: result.Content, + Content: content, Prompt: userPrompt, Metadata: map[string]any{ "analysis_type": "krknai", @@ -181,6 +209,7 @@ func (e *Engine) writeSummary(result *analysisengine.Result, data *krknAggregato summary := map[string]any{ "timestamp": time.Now().Format(time.RFC3339), "analysis_type": "krknai", + "cluster_info": data.ClusterInfo, "run_summary": map[string]any{ "total_scenarios": data.Summary.TotalScenarioCount, "successful_scenarios": data.Summary.SuccessfulScenarioCount, @@ -212,6 +241,30 @@ func (e *Engine) writeSummary(result *analysisengine.Result, data *krknAggregato return nil } +func markdownToHTML(content string) (string, error) { + htmlTmplBytes, err := krknPrompts.ReadFile(htmlTemplatePath) + if err != nil { + return "", fmt.Errorf("failed to read HTML template: %w", err) + } + + tmpl, err := template.New("report").Parse(string(htmlTmplBytes)) + if err != nil { + return "", fmt.Errorf("failed to parse HTML template: %w", err) + } + + p := parser.NewWithExtensions(parser.CommonExtensions | parser.AutoHeadingIDs) + renderer := mdhtml.NewRenderer(mdhtml.RendererOptions{Flags: mdhtml.CommonFlags | mdhtml.HrefTargetBlank}) + unsafeBody := markdown.ToHTML([]byte(content), p, renderer) + safeBody := bluemonday.UGCPolicy().SanitizeBytes(unsafeBody) + + var buf bytes.Buffer + if err := tmpl.Execute(&buf, struct{ Body template.HTML }{Body: template.HTML(string(safeBody))}); err != nil { + return "", fmt.Errorf("failed to execute HTML template: %w", err) + } + + return buf.String(), nil +} + // sendNotifications sends analysis results to configured reporters. func (e *Engine) sendNotifications(ctx context.Context, result *analysisengine.Result) { reporterResult := &reporter.AnalysisResult{ diff --git a/pkg/krknai/analysisengine/engine_test.go b/pkg/krknai/analysisengine/engine_test.go index 102c3ce998..06cd98f495 100644 --- a/pkg/krknai/analysisengine/engine_test.go +++ b/pkg/krknai/analysisengine/engine_test.go @@ -12,7 +12,7 @@ import ( "github.com/openshift/osde2e/internal/llm/tools" "github.com/openshift/osde2e/internal/prompts" "github.com/openshift/osde2e/internal/reporter" - krknAggregator "github.com/openshift/osde2e/pkg/krknai/aggregator" + krknAgg "github.com/openshift/osde2e/pkg/krknai/aggregator" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "gopkg.in/yaml.v3" @@ -50,13 +50,175 @@ func TestNew_ValidConfig(t *testing.T) { assert.Contains(t, err.Error(), "results directory is required") } -func TestEmbeddedPromptTemplate(t *testing.T) { - // Verify the embedded prompt template loads correctly - data, err := krknaiTemplatesFS.ReadFile("prompts/krknai.yaml") +func TestPromptTemplatesAvailable(t *testing.T) { + store := newTestPromptStore(t) + + tmpl, err := store.GetTemplate("krknai") require.NoError(t, err) - assert.Contains(t, string(data), "system_prompt") - assert.Contains(t, string(data), "user_prompt") - assert.Contains(t, string(data), "chaos engineering") + assert.Contains(t, tmpl.SystemPrompt, "chaos engineering") + assert.Contains(t, tmpl.UserPrompt, "Summary") + + assert.Contains(t, tmpl.SystemPrompt, "markdown") + assert.Contains(t, tmpl.SystemPrompt, "genetic algorithm") +} + +func TestRenderKrknAIPrompt(t *testing.T) { + store := newTestPromptStore(t) + + variables := map[string]any{ + "ClusterInfo": &krknAgg.ClusterInfo{ + ID: "abc-123", + Version: "4.17.3", + Type: "aws/rosa-hcp", + Region: "us-east-1", + Environment: "stage", + }, + "Summary": map[string]any{ + "TotalScenarioCount": 30, + "SuccessfulScenarioCount": 27, + "FailedScenarioCount": 3, + "Generations": 3, + "MaxFitnessScore": 8.75, + "AvgFitnessScore": 4.32, + "ScenarioTypes": []string{"node-cpu-hog", "node-memory-hog", "pod-scenarios"}, + }, + "TopScenarios": []map[string]any{ + { + "Scenario": "node-cpu-hog", + "GenerationID": 2, + "ScenarioID": 15, + "FitnessScore": 8.75, + "HealthCheckResponseTimeScore": 6.50, + "HealthCheckFailureScore": 2.25, + "KrknFailureScore": 0.0, + "Parameters": "node_selector: node-role.kubernetes.io/worker", + }, + }, + "FailedScenarios": []map[string]any{ + { + "Scenario": "dns-outage", + "GenerationID": 1, + "ScenarioID": 7, + "KrknFailureScore": -1.0, + "Parameters": "namespace: openshift-dns", + }, + }, + "HealthCheckReport": []map[string]any{ + { + "ScenarioID": 15, + "ComponentName": "console", + "MinResponseTime": 12.5, + "MaxResponseTime": 850.3, + "AverageResponseTime": 245.7, + "SuccessCount": 48, + "FailureCount": 2, + }, + }, + "LogArtifacts": []map[string]any{ + {"Source": "/results/reports/all.csv", "LineCount": 31}, + {"Source": "/results/krkn-ai.yaml", "LineCount": 85}, + }, + "ConfigSummary": "generations: 3\npopulation_size: 10\n", + } + + userPrompt, config, err := store.RenderPrompt("krknai", variables) + require.NoError(t, err) + require.NotNil(t, config) + + assert.Contains(t, userPrompt, "id=abc-123") + assert.Contains(t, userPrompt, "version=4.17.3") + assert.Contains(t, userPrompt, "type=aws/rosa-hcp") + assert.Contains(t, userPrompt, "region=us-east-1") + assert.Contains(t, userPrompt, "env=stage") + + assert.Contains(t, userPrompt, "30 scenarios") + assert.Contains(t, userPrompt, "27 ok") + assert.Contains(t, userPrompt, "3 failed") + assert.Contains(t, userPrompt, "max=8.75") + assert.Contains(t, userPrompt, "fitness=8.75") + assert.Contains(t, userPrompt, "node_selector: node-role.kubernetes.io/worker") + assert.Contains(t, userPrompt, "dns-outage") + assert.Contains(t, userPrompt, "console") + assert.Contains(t, userPrompt, "avg=245.70ms") + assert.Contains(t, userPrompt, "/results/reports/all.csv (31L)") + assert.Contains(t, userPrompt, "generations: 3") + + assert.NotNil(t, config.SystemInstruction) + assert.Contains(t, *config.SystemInstruction, "chaos engineering analyst") + assert.Contains(t, *config.SystemInstruction, "genetic algorithm") +} + +func TestRun_MarkdownReportFormat(t *testing.T) { + tempDir := t.TempDir() + reportsDir := filepath.Join(tempDir, "reports") + require.NoError(t, os.MkdirAll(reportsDir, 0o755)) + + createTestResultFiles(t, tempDir, reportsDir) + + ctx := context.Background() + agg := krknAgg.NewKrknAIAggregator(ctx) + promptStore := newTestPromptStore(t) + + mockClient := &mockLLMClient{ + response: &llm.AnalysisResult{ + Content: "# Krkn-AI Chaos Test Report\n\n## Executive Summary\nCluster shows moderate resilience.", + }, + } + + engine := &Engine{ + config: &Config{ + BaseConfig: analysisengine.BaseConfig{ArtifactsDir: tempDir, APIKey: "fake-key"}, + ReportFormat: "markdown", + }, + aggregator: agg, + promptStore: promptStore, + llmClient: mockClient, + reporterRegistry: newTestReporterRegistry(), + } + + result, err := engine.Run(ctx) + require.NoError(t, err) + require.NotNil(t, result) + assert.Contains(t, result.Content, "Chaos Test Report") +} + +func TestRun_HTMLReportFormat(t *testing.T) { + tempDir := t.TempDir() + reportsDir := filepath.Join(tempDir, "reports") + require.NoError(t, os.MkdirAll(reportsDir, 0o755)) + + createTestResultFiles(t, tempDir, reportsDir) + + ctx := context.Background() + agg := krknAgg.NewKrknAIAggregator(ctx) + promptStore := newTestPromptStore(t) + + mockClient := &mockLLMClient{ + response: &llm.AnalysisResult{ + Content: "# Krkn-AI Chaos Test Report\n\n## Executive Summary\nCluster shows **moderate** resilience.\n\n| Metric | Value |\n|--------|-------|\n| Total | 5 |\n", + }, + } + + engine := &Engine{ + config: &Config{ + BaseConfig: analysisengine.BaseConfig{ArtifactsDir: tempDir, APIKey: "fake-key"}, + ReportFormat: "html", + }, + aggregator: agg, + promptStore: promptStore, + llmClient: mockClient, + reporterRegistry: newTestReporterRegistry(), + } + + result, err := engine.Run(ctx) + require.NoError(t, err) + require.NotNil(t, result) + + assert.Contains(t, result.Content, "") + assert.Contains(t, result.Content, "") + assert.Contains(t, result.Content, "moderate") + assert.NotContains(t, result.Content, "## Executive Summary") } func TestWriteSummary(t *testing.T) { @@ -78,8 +240,8 @@ func TestWriteSummary(t *testing.T) { }, } - data := &krknAggregator.KrknAIData{ - Summary: krknAggregator.KrknAISummary{ + data := &krknAgg.KrknAIData{ + Summary: krknAgg.KrknAISummary{ TotalScenarioCount: 5, SuccessfulScenarioCount: 4, FailedScenarioCount: 1, @@ -88,10 +250,10 @@ func TestWriteSummary(t *testing.T) { AvgFitnessScore: 1.8, ScenarioTypes: []string{"node-cpu-hog", "pod-scenarios"}, }, - TopScenarios: []krknAggregator.ScenarioResult{ + TopScenarios: []krknAgg.ScenarioResult{ {ScenarioID: 1, Scenario: "node-cpu-hog", FitnessScore: 2.2}, }, - FailedScenarios: []krknAggregator.ScenarioResult{ + FailedScenarios: []krknAgg.ScenarioResult{ {ScenarioID: 5, Scenario: "dns-outage", KrknFailureScore: -1.0}, }, } @@ -133,7 +295,7 @@ func TestRun_WithMockLLM(t *testing.T) { ctx := context.Background() // Build engine with mock LLM client - agg := krknAggregator.NewKrknAIAggregator(ctx) + agg := krknAgg.NewKrknAIAggregator(ctx) promptStore := newTestPromptStore(t) @@ -182,7 +344,7 @@ func TestRun_LLMFailure(t *testing.T) { createTestResultFiles(t, tempDir, reportsDir) ctx := context.Background() - agg := krknAggregator.NewKrknAIAggregator(ctx) + agg := krknAgg.NewKrknAIAggregator(ctx) promptStore := newTestPromptStore(t) mockClient := &mockLLMClient{ @@ -206,7 +368,7 @@ func TestRun_LLMFailure(t *testing.T) { func TestRun_MissingResults(t *testing.T) { ctx := context.Background() - agg := krknAggregator.NewKrknAIAggregator(ctx) + agg := krknAgg.NewKrknAIAggregator(ctx) promptStore := newTestPromptStore(t) engine := &Engine{ @@ -224,13 +386,16 @@ func TestRun_MissingResults(t *testing.T) { assert.Contains(t, err.Error(), "failed to collect krkn-ai results") } -// newTestPromptStore creates a prompt store using the embedded krkn-ai templates. +// newTestPromptStore creates a prompt store using the central prompt templates. func newTestPromptStore(t *testing.T) *prompts.PromptStore { t.Helper() - templatesFS, err := fs.Sub(krknaiTemplatesFS, "prompts") + store, err := prompts.NewPromptStore(prompts.DefaultTemplates()) require.NoError(t, err) - store, err := prompts.NewPromptStore(templatesFS) + + localFS, err := fs.Sub(krknPrompts, "prompts") require.NoError(t, err) + require.NoError(t, store.RegisterTemplates(localFS)) + return store } diff --git a/pkg/krknai/analysisengine/prompts/krknai.yaml b/pkg/krknai/analysisengine/prompts/krknai.yaml index d22557a5aa..b7b79c7dac 100644 --- a/pkg/krknai/analysisengine/prompts/krknai.yaml +++ b/pkg/krknai/analysisengine/prompts/krknai.yaml @@ -1,94 +1,93 @@ system_prompt: | - You are an expert chaos engineering analyst reviewing Krkn-AI chaos test results on an OpenShift cluster. + Expert chaos engineering analyst for Krkn-AI results on OpenShift. + Ref: https://krkn-chaos.dev/docs/krkn_ai/ - Your task is to: - 1. Identify the most impactful chaos scenarios (highest fitness scores indicate more system disruption) - 2. Identify cluster vulnerabilities and weak points exposed by chaos tests - 3. Provide actionable resilience recommendations + Krkn-AI evolves chaos scenarios via genetic algorithm. The SLO fitness function combines health check failures + latency deviation as genetic algorithm feedback. Higher fitness = more system disruption = test objective achieved. - You have access to the read_file tool for examining specific scenario details: - - Use read_file tool: {"path": "file_path"} to read scenario YAML files or logs - - Use read_file with range: {"path": "file_path", "start": 10, "stop": 50} to read specific line ranges + Metrics: fitness_score=overall impact (higher=worse), health_check_failure_score=app failures (0=healthy), health_check_response_time_score=latency deviation, krkn_failure_score=-1.0 means scenario failed to execute (infra issue, not vulnerability). - Understanding the metrics: - - fitness_score: Overall impact score (higher = more impactful chaos, system showed more degradation) - - health_check_failure_score: Application failures during chaos (0 = no failures) - - health_check_response_time_score: Response time degradation (higher = slower responses) - - krkn_failure_score: -1.0 indicates the chaos scenario itself failed to execute + Tool: read_file({"files":[{"path":"file_path"}]}) or with range: {"files":[{"path":"p","start":10,"stop":50}]}. Only use paths from the artifacts list. - Common chaos scenario types: - - node-cpu-hog: CPU stress on nodes - - node-memory-hog: Memory pressure on nodes - - node-io-hog: I/O stress on nodes - - pod-scenarios: Pod disruption/deletion - - container-scenarios: Container-level chaos - - dns-outage: DNS disruption + Node role identification: scenario logs contain "node_taints" mapping hostnames to roles (node-role.kubernetes.io/master, /infra, /worker) and "node_summary_infos" with nodes_type. Cross-reference the node-selector hostname in scenario parameters with node_taints to determine the targeted node role (master/infra/worker). Always report the node role for node-targeting scenarios (cpu-hog, memory-hog, io-hog, node-scenarios). - Focus on: - - Which node types (master, infra, worker) are most vulnerable - - Which chaos types cause the most disruption - - Patterns in failed scenarios - - Recovery characteristics + Output a markdown report with these sections: + # Krkn-AI Chaos Test Report + ## Executive Summary (2-3 sentences) + ## Cluster Under Test (ID, version, type, region, environment) + ## Test Configuration (GA params; list all enabled chaos scenarios; health check targets with name, endpoint URL, and expected status code — extract expected_status_code from the krkn-ai.yaml artifact via read_file, never guess or infer it) + ## Run Statistics (table: totals, generations, fitness scores, types) + ## Genetic Algorithm Evolution (fitness trends, convergence, most disruptive generation) + ## Top Vulnerabilities (top 3-5 by fitness: target node role + hostname, impact, severity [Critical/High/Medium/Low], why it matters) + ## Failed Scenarios Analysis (if any) + ## Health Check Analysis (response time and failure patterns) + ## Cluster Resilience Assessment (rate CPU/Memory/IO/Pod/DNS: Strong/Moderate/Weak) + ## Recommendations (numbered, actionable, prioritized) + ## Appendix: Scenario Details (table: generation, ID, type, fitness, status, target node role) - Respond with valid JSON matching this schema: - { - "cluster_resilience_assessment": "Brief overall assessment of cluster resilience", - "top_vulnerabilities": [ - { - "component": "Affected component or node type", - "chaos_type": "Type of chaos that exposed this", - "impact": "Description of the impact observed", - "severity": "high|medium|low" - } - ], - "recommendations": [ - "Specific actionable recommendation 1", - "Specific actionable recommendation 2" - ], - "failed_scenarios_analysis": "Brief analysis of why scenarios failed, if any" - } + Output raw markdown only. user_prompt: | - Analyze these Krkn-AI chaos test results: + Analyze and report: + {{- if .ClusterInfo}} - **Run Summary:** - - Total Scenarios: {{.Summary.TotalScenarioCount}} - - Successful: {{.Summary.SuccessfulScenarioCount}} - - Failed: {{.Summary.FailedScenarioCount}} - - Generations: {{.Summary.Generations}} - - Max Fitness Score: {{printf "%.2f" .Summary.MaxFitnessScore}} - - Avg Fitness Score: {{printf "%.2f" .Summary.AvgFitnessScore}} - - Scenario Types Tested: {{range $i, $t := .Summary.ScenarioTypes}}{{if $i}}, {{end}}{{$t}}{{end}} + Cluster: id={{.ClusterInfo.ID}} version={{.ClusterInfo.Version}} type={{.ClusterInfo.Type}} region={{.ClusterInfo.Region}} env={{.ClusterInfo.Environment}} + {{- end}} + + Run: {{.Summary.TotalScenarioCount}} scenarios ({{.Summary.SuccessfulScenarioCount}} ok, {{.Summary.FailedScenarioCount}} failed), {{.Summary.Generations}} generations, fitness max={{printf "%.2f" .Summary.MaxFitnessScore}} avg={{printf "%.2f" .Summary.AvgFitnessScore}}, types: {{range $i, $t := .Summary.ScenarioTypes}}{{if $i}},{{end}}{{$t}}{{end}} - **Top {{len .TopScenarios}} Most Impactful Scenarios (by fitness score):** + Top scenarios: {{range .TopScenarios -}} - - **{{.Scenario}}** (Gen {{.GenerationID}}, ID {{.ScenarioID}}) - Fitness: {{printf "%.2f" .FitnessScore}} - Response Time Score: {{printf "%.2f" .HealthCheckResponseTimeScore}} | Failure Score: {{printf "%.2f" .HealthCheckFailureScore}} - Parameters: {{.Parameters}} + - {{.Scenario}} gen={{.GenerationID}} id={{.ScenarioID}} fitness={{printf "%.2f" .FitnessScore}} rt={{printf "%.2f" .HealthCheckResponseTimeScore}} fail={{printf "%.2f" .HealthCheckFailureScore}} krkn={{printf "%.2f" .KrknFailureScore}} params={{.Parameters}} {{end}} - {{if .FailedScenarios -}} - **Failed Scenarios ({{len .FailedScenarios}} total):** + {{- if .FailedScenarios -}} + Failed: {{range .FailedScenarios -}} - - {{.Scenario}} (ID {{.ScenarioID}}): {{.Parameters}} + - {{.Scenario}} gen={{.GenerationID}} id={{.ScenarioID}} krkn={{printf "%.2f" .KrknFailureScore}} params={{.Parameters}} {{end}} {{- end}} - {{if .ConfigSummary -}} - **Configuration Summary:** - ```yaml + {{- if .HealthCheckReport -}} + Health checks: + {{range .HealthCheckReport -}} + - id={{.ScenarioID}} {{.ComponentName}} avg={{printf "%.2f" .AverageResponseTime}}ms min={{printf "%.2f" .MinResponseTime}} max={{printf "%.2f" .MaxResponseTime}} ok={{.SuccessCount}} fail={{.FailureCount}} + {{end}} + {{- end}} + {{- if .ConfigSummary -}} + Config: {{.ConfigSummary}} - ``` {{- end}} - **Available Artifacts for Deep Dive:** + Artifacts: {{range .LogArtifacts -}} - - {{.Source}}{{if gt .LineCount 0}} ({{.LineCount}} lines){{- end}} + - {{.Source}}{{if gt .LineCount 0}} ({{.LineCount}}L){{- end}} {{end}} + Use read_file on relevant artifacts. For health check targets, you MUST read the krkn-ai.yaml artifact to extract the expected status code — never assume or fabricate this value. Generate the full markdown report per system prompt structure. - **Instructions:** - 1. Analyze the top impactful scenarios to identify which chaos types and node selectors cause the most disruption - 2. Look for patterns in the parameters (node selectors, taints) that correlate with high fitness scores - 3. If there are failed scenarios, analyze why they might have failed - 4. Use read_file tool to examine specific scenario YAML files if you need more details about health check timeseries - 5. Provide actionable recommendations to improve cluster resilience - - Provide your analysis as JSON. +variables: + - name: "ClusterInfo" + type: "object" + description: "ClusterInfo: ID, Version, Type (cloud/platform[-hcp]), Region, Environment" + required: false + - name: "Summary" + type: "object" + description: "KrknAISummary" + required: true + - name: "TopScenarios" + type: "array" + description: "[]ScenarioResult sorted by fitness desc" + required: true + - name: "FailedScenarios" + type: "array" + description: "[]ScenarioResult where KrknFailureScore=-1.0" + required: false + - name: "HealthCheckReport" + type: "array" + description: "[]HealthCheckResult per-scenario metrics" + required: false + - name: "LogArtifacts" + type: "array" + description: "[]LogEntry for read_file tool" + required: true + - name: "ConfigSummary" + type: "string" + description: "Formatted krkn-ai.yaml config" + required: false diff --git a/pkg/krknai/analysisengine/prompts/report.html b/pkg/krknai/analysisengine/prompts/report.html new file mode 100644 index 0000000000..60415900e4 --- /dev/null +++ b/pkg/krknai/analysisengine/prompts/report.html @@ -0,0 +1,26 @@ + + + + + + +Krkn-AI Chaos Test Report + + + +{{.Body}} + +