From d986873c7b38cfc91763c3fc18200e75f4a06cce Mon Sep 17 00:00:00 2001 From: pokom Date: Tue, 4 Feb 2025 21:37:34 -0500 Subject: [PATCH 1/4] Migrate to using cloudcost-exporter metrics First attempt at migrating to cloudcost-exporter metrics. Took a naive approach of adding a config option to toggle cloudcost-exporter metrics. Intended to add three new queries for cloudcost-exporter to get the avg cost of - CPU cores - Memory (GiB) - Persistent Volumes Discovered that cloudcost-exporter does not implement cost of persistent storage (see https://github.com/grafana/cloudcost-exporter/issues/236), so only the CPU and memory queries are added here; persistent volumes keep using the existing query. To test it out, follow the setup guide and run the estimator command with the new flag enabled: ```shell go run ./cmd/estimator/ \ -from $PWD/pkg/costmodel/testdata/resource/Deployment.json \ -to $PWD/pkg/costmodel/testdata/resource/Deployment-more-requests.json \ -http.config.file ~/.config/dev.yaml \ -prometheus.address $PROMETHEUS_ADDRESS \ -use.cloud.cost.exporter.metrics \ dev-us-east-0 ``` - relates #29 --- cmd/bot/config.go | 9 +++---- cmd/bot/main.go | 18 +++++++------- cmd/estimator/main.go | 15 +++++++----- pkg/costmodel/client.go | 53 ++++++++++++++++++++++++++++++++++++----- 4 files changed, 71 insertions(+), 24 deletions(-) diff --git a/cmd/bot/config.go b/cmd/bot/config.go index b9abb28..94ae1d0 100644 --- a/cmd/bot/config.go +++ b/cmd/bot/config.go @@ -31,10 +31,11 @@ type config struct { GitHub github.Config - IsCI bool `envconfig:"CI"` - PR int `envconfig:"GITHUB_PULL_REQUEST" required:"true"` - Event string `envconfig:"GITHUB_EVENT_NAME"` - LogLevel string `envconfig:"LOG_LEVEL" default:"info"` + IsCI bool `envconfig:"CI"` + PR int `envconfig:"GITHUB_PULL_REQUEST" required:"true"` + Event string `envconfig:"GITHUB_EVENT_NAME"` + LogLevel string `envconfig:"LOG_LEVEL" default:"info"` + UseCloudCostExporterMetrics bool `envconfig:"USE_CLOUD_COST_EXPORTER" default:"false"` } const pullRequestEvent = "pull_request" diff --git a/cmd/bot/main.go b/cmd/bot/main.go index 9a1f2df..9c0089e 100644 --- a/cmd/bot/main.go +++ 
b/cmd/bot/main.go @@ -56,16 +56,18 @@ func realMain(ctx context.Context) error { prometheusClients, err := costmodel.NewClients( &costmodel.ClientConfig{ - Address: cfg.Prometheus.Prod.Address, - HTTPConfigFile: cfg.Prometheus.Prod.HTTPConfigFile, - Username: cfg.Prometheus.Prod.Username, - Password: cfg.Prometheus.Prod.Password, + Address: cfg.Prometheus.Prod.Address, + HTTPConfigFile: cfg.Prometheus.Prod.HTTPConfigFile, + Username: cfg.Prometheus.Prod.Username, + Password: cfg.Prometheus.Prod.Password, + UseCloudCostExporterMetrics: cfg.UseCloudCostExporter, }, &costmodel.ClientConfig{ - Address: cfg.Prometheus.Dev.Address, - HTTPConfigFile: cfg.Prometheus.Dev.HTTPConfigFile, - Username: cfg.Prometheus.Dev.Username, - Password: cfg.Prometheus.Dev.Password, + Address: cfg.Prometheus.Dev.Address, + HTTPConfigFile: cfg.Prometheus.Dev.HTTPConfigFile, + Username: cfg.Prometheus.Dev.Username, + Password: cfg.Prometheus.Dev.Password, + UseCloudCostExporterMetrics: cfg.UseCloudCostExporter, }) if err != nil { return fmt.Errorf("creating cost model client: %w", err) diff --git a/cmd/estimator/main.go b/cmd/estimator/main.go index eafd436..dd6ef10 100644 --- a/cmd/estimator/main.go +++ b/cmd/estimator/main.go @@ -11,6 +11,7 @@ import ( func main() { var fromFile, toFile, prometheusAddress, httpConfigFile, reportType, username, password string + var useCloudCostExporterMetrics bool flag.StringVar(&fromFile, "from", "", "The file to compare from") flag.StringVar(&toFile, "to", "", "The file to compare to") flag.StringVar(&prometheusAddress, "prometheus.address", "http://localhost:9093/prometheus", "The Address of the prometheus server") @@ -18,18 +19,19 @@ func main() { flag.StringVar(&username, "username", "", "Mimir username") flag.StringVar(&password, "password", "", "Mimir password") flag.StringVar(&reportType, "report.type", "table", "The type of report to generate. 
Options are: table, summary") + flag.BoolVar(&useCloudCostExporterMetrics, "use.cloud.cost.exporter.metrics", false, "Whether to use the cloud cost exporter metrics") flag.Parse() clusters := flag.Args() ctx := context.Background() - if err := run(ctx, fromFile, toFile, prometheusAddress, httpConfigFile, reportType, username, password, clusters); err != nil { + if err := run(ctx, fromFile, toFile, prometheusAddress, httpConfigFile, reportType, username, password, clusters, useCloudCostExporterMetrics); err != nil { fmt.Printf("Could not run: %s\n", err) os.Exit(1) } } -func run(ctx context.Context, fromFile, toFile, address, httpConfigFile, reportType, username, password string, clusters []string) error { +func run(ctx context.Context, fromFile, toFile, address, httpConfigFile, reportType, username, password string, clusters []string, useCloudCostExporterMetrics bool) error { from, err := os.ReadFile(fromFile) if err != nil { return fmt.Errorf("could not read file: %s", err) @@ -42,10 +44,11 @@ func run(ctx context.Context, fromFile, toFile, address, httpConfigFile, reportT } client, err := costmodel.NewClient(&costmodel.ClientConfig{ - Address: address, - HTTPConfigFile: httpConfigFile, - Username: username, - Password: password, + Address: address, + HTTPConfigFile: httpConfigFile, + Username: username, + Password: password, + UseCloudCostExporterMetrics: useCloudCostExporterMetrics, }) if err != nil { diff --git a/pkg/costmodel/client.go b/pkg/costmodel/client.go index 0cd26b6..752e304 100644 --- a/pkg/costmodel/client.go +++ b/pkg/costmodel/client.go @@ -31,6 +31,15 @@ avg by (spot) (node_cpu_hourly_cost{cluster="%s"} ) ) ) +` + cloudcostExporterQueryCostPerCpu = ` + avg by (price_tier) ( + cloudcost_aws_ec2_instance_cpu_usd_per_core_hour{cluster_name="%s"} + or + cloudcost_azure_aks_instance_cpu_usd_per_core_hour{cluster_name="%s"} + or + cloudcost_gcp_gke_instance_cpu_usd_per_core_hour{cluster_name="%s"} +) ` queryMemoryCost = ` @@ -48,6 +57,15 @@ avg by 
(spot) (node_ram_hourly_cost{cluster="%s"} ) ) ) +` + cloudcostExporterQueryMemoryCost = ` + avg by (price_tier) ( + cloudcost_aws_ec2_instance_memory_usd_per_gib_hour{cluster_name="%s"} + or + cloudcost_azure_aks_instance_memory_usd_per_gib_hour{cluster_name="%s"} + or + cloudcost_gcp_gke_instance_memory_usd_per_gib_hour{cluster_name="%s"} +) ` queryPersistentVolumeCost = "avg_over_time(avg(pv_hourly_cost{cluster=\"%s\"})[24h:1m])" @@ -70,7 +88,8 @@ var ( // Client is a client for the cost model. type Client struct { - client api.Client + client api.Client + useCloudCostExporterMetrics bool } // Clients bundles the dev and prod client in one struct. @@ -81,10 +100,11 @@ type Clients struct { // ClientConfig is the configuration for the cost model client. type ClientConfig struct { - Address string - HTTPConfigFile string - Username string - Password string + Address string + HTTPConfigFile string + Username string + Password string + UseCloudCostExporterMetrics bool } // NewClient creates a new cost model client with the given configuration. @@ -123,7 +143,10 @@ func NewClient(config *ClientConfig) (*Client, error) { if err != nil { return nil, err } - return &Client{client: client}, nil + return &Client{ + client: client, + useCloudCostExporterMetrics: config.UseCloudCostExporterMetrics, + }, nil } // NewClients creates a new cost model clients with the given configuration. @@ -143,6 +166,10 @@ func NewClients(prodConfig, devConfig *ClientConfig) (*Clients, error) { // GetCostPerCPU returns the average cost per CPU for a given cluster. 
func (c *Client) GetCostPerCPU(ctx context.Context, cluster string) (Cost, error) { query := fmt.Sprintf(queryCostPerCPU, cluster) + if c.useCloudCostExporterMetrics { + query = fmt.Sprintf(cloudcostExporterQueryCostPerCpu, cluster, cluster, cluster) + } + fmt.Printf("query: %s\n", query) results, err := c.query(ctx, query) if err != nil { return Cost{}, err @@ -153,6 +180,9 @@ func (c *Client) GetCostPerCPU(ctx context.Context, cluster string) (Cost, error // GetMemoryCost returns the cost per memory for a given cluster func (c *Client) GetMemoryCost(ctx context.Context, cluster string) (Cost, error) { query := fmt.Sprintf(queryMemoryCost, cluster) + if c.useCloudCostExporterMetrics { + query = fmt.Sprintf(cloudcostExporterQueryMemoryCost, cluster, cluster, cluster) + } results, err := c.query(ctx, query) if err != nil { return Cost{}, err @@ -206,6 +236,17 @@ func (c *Client) parseResults(results model.Value) (Cost, error) { // This is when there is no spot/non-spot label cost.Dollars = value } + // Handles the case for cloudcost exporter metrics where `price_tier` is the label for spot/non-spot + // TODO: Delete after removing support for OpenCost + switch sample.Metric["price_tier"] { + case "ondemand": + cost.NonSpot = value + case "spot": + cost.Spot = value + default: + // This is when there is no spot/non-spot label + cost.Dollars = value + } } return cost, nil From 099cb456e294a09aebadddba688c4ddfbe3c1abe Mon Sep 17 00:00:00 2001 From: pokom Date: Tue, 4 Feb 2025 21:45:44 -0500 Subject: [PATCH 2/4] Unbreak build --- cmd/bot/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/bot/main.go b/cmd/bot/main.go index 9c0089e..d5454c6 100644 --- a/cmd/bot/main.go +++ b/cmd/bot/main.go @@ -60,14 +60,14 @@ func realMain(ctx context.Context) error { HTTPConfigFile: cfg.Prometheus.Prod.HTTPConfigFile, Username: cfg.Prometheus.Prod.Username, Password: cfg.Prometheus.Prod.Password, - UseCloudCostExporterMetrics: cfg.UseCloudCostExporter, + 
UseCloudCostExporterMetrics: cfg.UseCloudCostExporterMetrics, }, &costmodel.ClientConfig{ Address: cfg.Prometheus.Dev.Address, HTTPConfigFile: cfg.Prometheus.Dev.HTTPConfigFile, Username: cfg.Prometheus.Dev.Username, Password: cfg.Prometheus.Dev.Password, - UseCloudCostExporterMetrics: cfg.UseCloudCostExporter, + UseCloudCostExporterMetrics: cfg.UseCloudCostExporterMetrics, }) if err != nil { return fmt.Errorf("creating cost model client: %w", err) From 786f17efe11cb52d9e9bb5bc336d9d0d2a0d3578 Mon Sep 17 00:00:00 2001 From: pokom Date: Wed, 5 Feb 2025 08:39:08 -0500 Subject: [PATCH 3/4] Fix broken test The new `price_tier` switch in parseResults falls through to its default branch for samples that only carry the `spot` label, so Dollars is now set to the value of the last such sample; update the expected Cost accordingly. --- pkg/costmodel/client_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/costmodel/client_test.go b/pkg/costmodel/client_test.go index 7ae626e..45d5cff 100644 --- a/pkg/costmodel/client_test.go +++ b/pkg/costmodel/client_test.go @@ -205,7 +205,7 @@ func TestParseResults(t *testing.T) { &model.Sample{Metric: model.Metric{"spot": "false"}, Value: 2.71}, &model.Sample{Metric: model.Metric{"spot": "true"}, Value: 1.41}, }, - Cost{Spot: 1.41, NonSpot: 2.71}, + Cost{Spot: 1.41, NonSpot: 2.71, Dollars: 1.41}, nil, }, } From 71cc331296e6e12a615d86842dadd1027511e7a8 Mon Sep 17 00:00:00 2001 From: pokom Date: Mon, 24 Feb 2025 10:13:42 -0500 Subject: [PATCH 4/4] Remove debug print statement Note: besides the debug print, this hunk also removes the `useCloudCostExporterMetrics` branch from GetCostPerCPU, so GetCostPerCPU always runs queryCostPerCPU regardless of the toggle. --- pkg/costmodel/client.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pkg/costmodel/client.go b/pkg/costmodel/client.go index 752e304..b279950 100644 --- a/pkg/costmodel/client.go +++ b/pkg/costmodel/client.go @@ -166,10 +166,6 @@ func NewClients(prodConfig, devConfig *ClientConfig) (*Clients, error) { // GetCostPerCPU returns the average cost per CPU for a given cluster. 
func (c *Client) GetCostPerCPU(ctx context.Context, cluster string) (Cost, error) { query := fmt.Sprintf(queryCostPerCPU, cluster) - if c.useCloudCostExporterMetrics { - query = fmt.Sprintf(cloudcostExporterQueryCostPerCpu, cluster, cluster, cluster) - } - fmt.Printf("query: %s\n", query) results, err := c.query(ctx, query) if err != nil { return Cost{}, err