From 0d6680f49d4ac178e32ca3b75bcd03bbb692999f Mon Sep 17 00:00:00 2001 From: pokom Date: Wed, 12 Mar 2025 20:15:26 -0400 Subject: [PATCH] Add query to calculate cost of PV change with CloudCost exporter metrics This one's a bit more of a challenge than CPU/Memory, due to three problems: 1. CloudCost Exporter does not emit metrics for persistent volumes for Azure (https://github.com/grafana/cloudcost-exporter/issues/236) 2. AWS EBS cost metrics do not have a cluster label (https://github.com/grafana/cloudcost-exporter/issues/450) 3. Persistent volumes in GKE and EKS emit the total hourly cost of the volume, _not_ the hourly cost per GiB I utilized the Prometheus `or` operator (https://prometheus.io/docs/prometheus/latest/querying/operators/#logical-set-binary-operators) to overcome not having Azure PV costs. Effectively the query will attempt to find the average cost of PVs for 1. EKS volumes via CloudCost Exporter 2. GKE volumes via CloudCost Exporter 3. Azure volumes via OpenCost This works because we're only querying one cluster at a time _by name_, and we rely upon the fact that cluster names are unique within Grafana Labs infrastructure. The missing cluster label for EKS cost metrics and persistent volumes not having cluster labels can be overcome by utilizing `kube_persistentvolume_capacity_bytes` metrics emitted by kube-state-metrics. 
This was tested by looking at an EKS cluster like so: ```shell go run ./cmd/estimator/ \ -use.cloud.cost.exporter.metrics=true -from $PWD/pkg/costmodel/testdata/resource/StatefulSet.json \ -to $PWD/pkg/costmodel/testdata/resource/StatefulSet-more-storage.json \ -http.config.file ~/.config/dev.yaml \ -prometheus.address $PROMETHEUS_ADDRESS \ dev-us-east-0 ``` --- pkg/costmodel/client.go | 30 +++- .../resource/StatefulSet-more-storage.json | 147 ++++++++++++++++++ 2 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 pkg/costmodel/testdata/resource/StatefulSet-more-storage.json diff --git a/pkg/costmodel/client.go b/pkg/costmodel/client.go index 797c2a1..66f99cb 100644 --- a/pkg/costmodel/client.go +++ b/pkg/costmodel/client.go @@ -67,7 +67,31 @@ avg by (spot) (node_ram_hourly_cost{cluster="%s"} cloudcost_gcp_gke_instance_memory_usd_per_gib_hour{cluster_name="%s"} ) ` - queryPersistentVolumeCost = "avg_over_time(avg(pv_hourly_cost{cluster=\"%s\"})[24h:1m])" + queryPersistentVolumeCost = ` +avg_over_time( + avg( + pv_hourly_cost{cluster="%s"} + )[24h:1m] +)` + cloudcostQueryPersistentVolumeCost = ` + avg( + cloudcost_aws_ec2_persistent_volume_usd_per_hour{persistentvolume!="", state="in-use"} + / on (persistentvolume) group_left() ( + kube_persistentvolume_capacity_bytes{cluster="%s"} / 1e9 + ) + ) + or + avg( + cloudcost_gcp_gke_persistent_volume_usd_per_hour{persistentvolume!="", use_status="in-use", cluster_name="%s"} + / on (persistentvolume) group_left() ( + kube_persistentvolume_capacity_bytes{cluster="%s"} / 1e9 + ) + ) + or + avg( + pv_hourly_cost{cluster="%s"} + ) +` queryAverageNodeCount = ` avg_over_time( @@ -212,6 +236,10 @@ func (c *Client) GetNodeCount(ctx context.Context, cluster string) (int, error) // GetCostForPersistentVolume returns the average cost per persistent volume for a given cluster func (c *Client) GetCostForPersistentVolume(ctx context.Context, cluster string) (Cost, error) { query := fmt.Sprintf(queryPersistentVolumeCost, 
cluster) + if c.useCloudCostExporterMetrics { + slog.Info("GetCostForPersistentVolume", "cluster", cluster, "message", "using cloudcost exporter metrics") + query = fmt.Sprintf(cloudcostQueryPersistentVolumeCost, cluster, cluster, cluster, cluster) + } results, err := c.query(ctx, query) if err != nil { return Cost{}, err diff --git a/pkg/costmodel/testdata/resource/StatefulSet-more-storage.json b/pkg/costmodel/testdata/resource/StatefulSet-more-storage.json new file mode 100644 index 0000000..e82248d --- /dev/null +++ b/pkg/costmodel/testdata/resource/StatefulSet-more-storage.json @@ -0,0 +1,147 @@ +{ + "apiVersion": "apps/v1", + "kind": "StatefulSet", + "metadata": { + "creationTimestamp": "2022-10-05T20:38:31Z", + "generation": 17, + "labels": { + "kustomize.toolkit.fluxcd.io/name": "kube-manifests-opencost", + "kustomize.toolkit.fluxcd.io/namespace": "opencost", + "tanka.dev/environment": "85ead74422d749cb54711e74c81bc5d6ed6da54e92b5fa69" + }, + "name": "opencost", + "namespace": "opencost", + "resourceVersion": "2386985939", + "uid": "56495ef8-2650-46e8-9528-28759cf47151" + }, + "spec": { + "podManagementPolicy": "OrderedReady", + "replicas": 1, + "revisionHistoryLimit": 10, + "selector": { + "matchLabels": { + "name": "opencost" + } + }, + "serviceName": "opencost", + "template": { + "metadata": { + "creationTimestamp": null, + "labels": { + "name": "opencost" + } + }, + "spec": { + "affinity": { + "nodeAffinity": { + "preferredDuringSchedulingIgnoredDuringExecution": [ + { + "preference": { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-spot", + "operator": "In", + "values": [ + "true" + ] + } + ] + }, + "weight": 100 + } + ] + } + }, + "containers": [ + { + "env": [ + ], + "image": "quay.io/kubecost1/kubecost-cost-model:prod-1.100.0", + "imagePullPolicy": "IfNotPresent", + "name": "opencost", + "ports": [ + { + "containerPort": 9003, + "name": "http-metrics", + "protocol": "TCP" + } + ], + "resources": { + "limits": { + "cpu": "4", + "memory": 
"8Gi" + }, + "requests": { + "cpu": "1", + "memory": "4Gi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/configs", + "name": "opencost-data" + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": { + "fsGroup": 10001 + }, + "serviceAccount": "opencost", + "serviceAccountName": "opencost", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "type", + "operator": "Equal", + "value": "spot-node" + } + ] + } + }, + "updateStrategy": { + "type": "RollingUpdate" + }, + "volumeClaimTemplates": [ + { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "creationTimestamp": null, + "name": "opencost-data" + }, + "spec": { + "accessModes": [ + "ReadWriteOnce" + ], + "resources": { + "requests": { + "storage": "320Gi" + } + }, + "volumeMode": "Filesystem" + }, + "status": { + "phase": "Pending" + } + } + ] + }, + "status": { + "availableReplicas": 1, + "collisionCount": 0, + "currentReplicas": 1, + "currentRevision": "opencost-6666f8bdb7", + "observedGeneration": 17, + "readyReplicas": 1, + "replicas": 1, + "updateRevision": "opencost-6666f8bdb7", + "updatedReplicas": 1 + } +}