Skip to content

Commit 794c6ad

Browse files
committed
SREP-3106: Add CAD run command for scheduling manual CAD investigations
1 parent 814c7f0 commit 794c6ad

File tree

9 files changed

+614
-0
lines changed

9 files changed

+614
-0
lines changed

cmd/cluster/cad/README.md

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# CAD (Configuration Anomaly Detection) Commands
2+
3+
Commands for running manual investigations on the Configuration Anomaly Detection (CAD) clusters and writing the results to a backplane report.
4+
5+
## Prerequisites
6+
7+
- You are logged in to the OCM environment of the target cluster: `ocm login --use-auth-code --url "<production|stage>"`
8+
- The CAD clusters (both stage and prod) are always in production OCM
9+
10+
## Usage
11+
12+
```bash
13+
osdctl cluster cad run \
14+
--cluster-id <cluster-id> \
15+
--investigation <investigation-name> \
16+
--environment <stage|production> \
17+
--reason "<JIRA-ticket or reason>"
18+
```
19+
20+
### Flags
21+
22+
- `--cluster-id` / `-C`: Target cluster ID (internal or external)
23+
- `--investigation` / `-i`: Investigation to run (see available investigations below)
24+
- `--environment` / `-e`: Target cluster environment (`stage` or `production`). This must be given explicitly rather than inferred, because the pipeline fails silently if the environment is wrong
25+
- `--reason`: Elevation reason for backplane access (e.g., `OHSS-1234` or `#ITN-2024-12345`)
26+
27+
### Available Investigations
28+
29+
- `chgm` - Cluster Has Gone Missing
30+
- `cmbb` - Cluster Monitoring Error Budget Burn
31+
- `can-not-retrieve-updates` - Update Retrieval Issues
32+
- `ai` - AI-based Analysis
33+
- `cpd` - Cluster Provisioning Delay
34+
- `etcd-quota-low` - ETCD Quota Issues
35+
- `insightsoperatordown` - Insights Operator Down
36+
- `machine-health-check` - Machine Health Check
37+
- `must-gather` - Must-Gather Collection
38+
- `upgrade-config` - Upgrade Configuration Check
39+
40+
### Example
41+
42+
```bash
43+
osdctl cluster cad run \
44+
--cluster-id 1a2b3c4d5e6f7g8h9i0j \
45+
--investigation chgm \
46+
--environment production \
47+
--reason "OHSS-12345"
48+
```
49+
50+
## Debugging
51+
52+
To check the status of a PipelineRun after scheduling:
53+
54+
**1. Connect to production OCM**
55+
```bash
56+
ocm login --use-auth-code --url "production"
57+
```
58+
59+
**2. Login to the CAD cluster**
60+
- For stage: `ocm backplane login cads01ue1`
61+
- For prod: `ocm backplane login cadp01ue1`
62+
63+
**3. Check PipelineRuns**
64+
- For stage:
65+
```bash
66+
ocm backplane elevate -n -- get pipelinerun -n configuration-anomaly-detection-stage
67+
```
68+
- For prod:
69+
```bash
70+
ocm backplane elevate -n -- get pipelinerun -n configuration-anomaly-detection-production
71+
```
72+
73+
## Architecture Notes
74+
75+
- **CAD Cluster IDs**: Hardcoded in app-interface
76+
- Stage: `2f9ghpikkv446iidcv7b92em2hgk13q9` (cads01ue1)
77+
- Prod: `2fbi9mjhqpobh20ot5d7e5eeq3a8gfhs` (cadp01ue1)
78+
- **Namespaces**:
79+
- Stage: `configuration-anomaly-detection-stage`
80+
- Prod: `configuration-anomaly-detection-production`
81+
- **Pipeline**: `cad-manual-investigation-pipeline` (Tekton)
82+
- The command always connects to production OCM internally, regardless of user's current OCM context
83+
84+
## Viewing Reports
85+
86+
After the investigation completes (may take several minutes), view reports using:
87+
88+
```bash
89+
osdctl cluster reports list -C <cluster-id> -l 1
90+
```
91+
92+
**Note**: You need to be connected to the correct OCM environment for the target cluster to view its reports.

cmd/cluster/cad/cad.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package cad
2+
3+
import (
4+
"github.com/spf13/cobra"
5+
)
6+
7+
func NewCmdCad() *cobra.Command {
8+
cadCmd := &cobra.Command{
9+
Use: "cad",
10+
Short: "Provides commands to run CAD tasks",
11+
Args: cobra.NoArgs,
12+
DisableAutoGenTag: true,
13+
}
14+
15+
cadCmd.AddCommand(newCmdRun())
16+
return cadCmd
17+
}

cmd/cluster/cad/run.go

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
package cad
2+
3+
import (
4+
"context"
5+
"fmt"
6+
7+
"github.com/openshift/osdctl/pkg/k8s"
8+
"github.com/openshift/osdctl/pkg/utils"
9+
"github.com/spf13/cobra"
10+
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
11+
"k8s.io/apimachinery/pkg/runtime/schema"
12+
"sigs.k8s.io/controller-runtime/pkg/client"
13+
)
14+
15+
// IDs of the CAD clusters on which manual investigations are scheduled.
// These IDs are hard-coded in app-interface.
const (
	// cadClusterIDProd is the ID of the production CAD cluster.
	cadClusterIDProd = "2fbi9mjhqpobh20ot5d7e5eeq3a8gfhs"
	// cadClusterIDStage is the ID of the stage CAD cluster.
	cadClusterIDStage = "2f9ghpikkv446iidcv7b92em2hgk13q9"
)
19+
20+
// validInvestigations lists every value accepted by the --investigation flag.
// It is used both for shell completion and for validating the flag.
var validInvestigations = []string{
	"chgm",
	"cmbb",
	"can-not-retrieve-updates",
	"ai",
	"cpd",
	"etcd-quota-low",
	"insightsoperatordown",
	"machine-health-check",
	"must-gather",
	"upgrade-config",
}
32+
33+
// validEnvironments lists every value accepted by the --environment flag.
// It is used both for shell completion and for validating the flag.
var validEnvironments = []string{
	"stage",
	"production",
}
37+
38+
// cadRunOptions holds the flag values of the "cad run" command.
type cadRunOptions struct {
	// clusterID is the target cluster the investigation runs against (--cluster-id).
	clusterID string
	// investigation is the name of the investigation to run (--investigation).
	investigation string
	// elevationReason is the reason passed on when requesting elevated
	// backplane access to the CAD cluster (--reason).
	elevationReason string
	// environment selects the stage or production CAD cluster (--environment).
	environment string
}
44+
45+
func newCmdRun() *cobra.Command {
46+
opts := &cadRunOptions{}
47+
48+
runCmd := &cobra.Command{
49+
Use: "run",
50+
Short: "Run a manual investigation on the CAD cluster",
51+
Long: `Run a manual investigation on the Configuration Anomaly Detection (CAD) cluster.
52+
53+
This command schedules a Tekton PipelineRun on the appropriate CAD cluster (stage or production)
54+
to run an investigation against a target cluster.
55+
56+
Prerequisites:
57+
- Connected to the target cluster's OCM environment (production or stage)
58+
- The CAD clusters themselves are always in production OCM
59+
60+
Available Investigations:
61+
chgm, cmbb, can-not-retrieve-updates, ai, cpd, etcd-quota-low,
62+
insightsoperatordown, machine-health-check, must-gather, upgrade-config
63+
64+
Example:
65+
# Run a change management investigation on a production cluster
66+
osdctl cluster cad run \
67+
--cluster-id 1a2b3c4d5e6f7g8h9i0j \
68+
--investigation chgm \
69+
--environment production \
70+
--reason "OHSS-12345"
71+
72+
Note:
73+
After the investigation completes (may take several minutes), view results using:
74+
osdctl cluster reports list -C <cluster-id> -l 1
75+
76+
You must be connected to the target cluster's OCM environment to view its reports.`,
77+
Args: cobra.NoArgs,
78+
DisableAutoGenTag: true,
79+
RunE: func(cmd *cobra.Command, args []string) error {
80+
return opts.run()
81+
},
82+
}
83+
84+
runCmd.Flags().StringVarP(&opts.clusterID, "cluster-id", "C", "", "Cluster ID (internal or external)")
85+
runCmd.Flags().StringVarP(&opts.investigation, "investigation", "i", "", "Investigation name")
86+
runCmd.Flags().StringVarP(&opts.environment, "environment", "e", "", "Environment of the cluster we want to run the investigation on. Allowed values: \"stage\" or \"production\"")
87+
runCmd.Flags().StringVar(&opts.elevationReason, "reason", "", "Provide a reason for running a manual investigation, used for backplane. Eg: 'OHSS-XXXX', or '#ITN-2024-XXXXX.")
88+
89+
_ = runCmd.RegisterFlagCompletionFunc("investigation", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) {
90+
return validInvestigations, cobra.ShellCompDirectiveNoFileComp
91+
})
92+
93+
_ = runCmd.RegisterFlagCompletionFunc("environment", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) {
94+
return validEnvironments, cobra.ShellCompDirectiveNoFileComp
95+
})
96+
97+
return runCmd
98+
}
99+
100+
func (o *cadRunOptions) run() error {
101+
if err := o.validate(); err != nil {
102+
return err
103+
}
104+
105+
cadClusterID, cadNamespace := o.getCADClusterConfig()
106+
107+
// CAD clusters are always in production OCM, so explicitly create a production connection
108+
ocmConn, err := utils.CreateConnectionWithUrl("production")
109+
if err != nil {
110+
return fmt.Errorf("failed to create production OCM connection: %w", err)
111+
}
112+
defer ocmConn.Close()
113+
114+
k8sClient, err := k8s.NewAsBackplaneClusterAdminWithConn(cadClusterID, client.Options{}, ocmConn, o.elevationReason, "Need elevation for cad cluster in order to schedule a Tekton pipeline run")
115+
if err != nil {
116+
return fmt.Errorf("failed to create k8s client: %w", err)
117+
}
118+
119+
u := o.pipelineRunTemplate(cadNamespace)
120+
121+
err = k8sClient.Create(context.Background(), u)
122+
if err != nil {
123+
return fmt.Errorf("failed to schedule task: %w", err)
124+
}
125+
126+
reportCmd := fmt.Sprintf("'osdctl cluster reports list -C %s -l 1'", o.clusterID)
127+
fmt.Println("Successfully scheduled manual investigation. It can take several minutes until a report is available. Run this command to check the latest report for the results while being connected to the right OCM backplane environment. " + reportCmd)
128+
129+
return nil
130+
}
131+
132+
func (o *cadRunOptions) validate() error {
133+
conn, err := utils.CreateConnection()
134+
if err != nil {
135+
return err
136+
}
137+
defer conn.Close()
138+
139+
if o.clusterID == "" {
140+
return fmt.Errorf("cluster-id is required")
141+
}
142+
143+
validInvestigation := false
144+
for _, v := range validInvestigations {
145+
if o.investigation == v {
146+
validInvestigation = true
147+
break
148+
}
149+
}
150+
if !validInvestigation {
151+
return fmt.Errorf("invalid investigation %q, must be one of: %v", o.investigation, validInvestigations)
152+
}
153+
154+
validEnvironment := false
155+
for _, v := range validEnvironments {
156+
if o.environment == v {
157+
validEnvironment = true
158+
break
159+
}
160+
}
161+
if !validEnvironment {
162+
return fmt.Errorf("invalid environment %q, must be one of: %v", o.environment, validEnvironments)
163+
}
164+
165+
if o.elevationReason == "" {
166+
return fmt.Errorf("elevation reason is required")
167+
}
168+
169+
return nil
170+
}
171+
172+
func (o *cadRunOptions) getCADClusterConfig() (clusterID, namespace string) {
173+
if o.environment == "stage" {
174+
return cadClusterIDStage, "configuration-anomaly-detection-stage"
175+
}
176+
return cadClusterIDProd, "configuration-anomaly-detection-production"
177+
}
178+
179+
func (o *cadRunOptions) pipelineRunTemplate(cadNamespace string) *unstructured.Unstructured {
180+
u := unstructured.Unstructured{}
181+
u.Object = map[string]interface{}{
182+
"apiVersion": "tekton.dev/v1beta1",
183+
"kind": "PipelineRun",
184+
"metadata": map[string]interface{}{
185+
"generateName": "cad-manual-",
186+
"namespace": cadNamespace,
187+
},
188+
"spec": map[string]interface{}{
189+
"params": []map[string]interface{}{
190+
{
191+
"name": "cluster-id",
192+
"value": o.clusterID,
193+
},
194+
{
195+
"name": "investigation",
196+
"value": o.investigation,
197+
},
198+
{
199+
"name": "dry-run",
200+
"value": "false",
201+
},
202+
},
203+
"pipelineRef": map[string]interface{}{
204+
"name": "cad-manual-investigation-pipeline",
205+
},
206+
"serviceAccountName": "cad-sa",
207+
"timeout": "30m",
208+
},
209+
}
210+
211+
u.SetGroupVersionKind(schema.GroupVersionKind{
212+
Group: "tekton.dev",
213+
Version: "v1beta1",
214+
Kind: "PipelineRun",
215+
})
216+
217+
return &u
218+
}

0 commit comments

Comments
 (0)