Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions dags.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2565,3 +2565,15 @@ bqetl_claude_api:
schedule_interval: 0 6 * * *
tags:
- impact/tier_3

bqetl_rust_component_metrics:
schedule_interval: 0 3 * * *
description: The DAG schedules Rust component metric queries.
default_args:
owner: sync-team@mozilla.com
start_date: "2026-02-05"
email: ["telemetry-alerts@mozilla.com", "sync-team@mozilla.com"]
retries: 2
retry_delay: 30m
tags:
- impact/tier_3
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
friendly_name: Shared Rust Components
description: |-
Metrics for shared Rust components
dataset_base_acl: view
user_facing: true
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential/data-viewers
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
friendly_name: Shared Rust Components Derived
description: |-
Derived data for shared Rust components
dataset_base_acl: derived
user_facing: false
labels: {}
default_table_expiration_ms: null
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential/data-viewers
232 changes: 232 additions & 0 deletions sql_generators/rust_component_metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
"""Generate metric aggregates for shared Rust components."""

import os
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from typing import ClassVar

import click

from bigquery_etl.cli.utils import use_cloud_function_option
from bigquery_etl.util.common import render, write_sql

THIS_PATH = Path(os.path.dirname(__file__))

def all_metric_groups() -> list["MetricGroup"]:
"""Glean metrics to aggregate / optimize"""

return [
MetricGroup(
ping="metrics",
category="logins_store",
applications=[
Application.firefox_android,
Application.firefox_ios,
],
metrics=[
Counter("mirror_undecryptable_deleted"),
Event("key_regenerated_corrupt"),
Event("key_regenerated_lost"),
Event("key_regenerated_other"),
],
),
MetricGroup(
ping="metrics",
category="places_manager",
applications=[
Application.firefox_android,
],
metrics=[
Distribution("db_size_after_maintenance", DistributionType.memory),
Distribution("run_maintenance_chk_pnt_time", DistributionType.timing),
Distribution("run_maintenance_optimize_time", DistributionType.timing),
Distribution("run_maintenance_prune_time", DistributionType.timing),
Distribution("run_maintenance_time", DistributionType.timing),
Distribution("run_maintenance_vacuum_time", DistributionType.timing),
],
),
MetricGroup(
ping="metrics",
category="suggest",
applications=[
Application.firefox_desktop,
],
metrics=[
LabeledDistribution("ingest_download_time", DistributionType.timing),
LabeledDistribution("ingest_time", DistributionType.timing),
LabeledDistribution("query_time", DistributionType.timing),
],
),
]

class Application(Enum):
    """Applications that collect shared Rust component metrics.

    The member name is injected into queries as the `application` label,
    and the value is the BigQuery dataset name for that app (e.g. Firefox
    Android data lives in the `fenix` dataset) — see `generate`.
    """

    firefox_desktop = "firefox_desktop"
    firefox_android = "fenix"
    firefox_ios = "firefox_ios"

class DistributionType(Enum):
    """Glean Distribution type.

    Selects which `metrics.*` column family and unit string the generated
    SQL uses — see `get_metric_data`.
    """

    timing = auto()  # timing_distribution, samples in nanoseconds
    memory = auto()  # memory_distribution, samples in bytes
    custom = auto()  # custom_distribution, no unit

@dataclass
class Metric:
    """Base class for metrics that we collect."""

    # Metric name within its category; combined with the category as
    # `{category}_{name}` to address the column in Glean ping tables.
    name: str
    # Directory under `templates/` holding the query.sql / schema.yaml
    # templates for this metric kind; set by each concrete subclass.
    template_dir: ClassVar[Path]

@dataclass
class Counter(Metric):
    """A Glean counter metric, aggregated as a daily sum."""

    template_dir = Path("counter")

@dataclass
class Distribution(Metric):
    """A Glean distribution metric, aggregated into daily percentiles."""

    template_dir = Path("distribution")
    # Kind of distribution; selects the source column family and unit
    # via `get_metric_data`.
    type: DistributionType

@dataclass
class LabeledDistribution(Metric):
    """A Glean labeled distribution metric.

    Read from the `labeled_*_distribution` column family (the source
    column gets a `labeled_` prefix in `get_metric_data`).
    """

    template_dir = Path("labeled-distribution")
    # Kind of distribution; selects the source column family and unit.
    type: DistributionType

@dataclass
class Event(Metric):
    """A Glean event metric, aggregated as a daily event count."""

    template_dir = Path("event")

def get_metric_data(metric: Metric) -> dict[str, str]:
data = {
"name": metric.name
}
match metric:
case Distribution(_, type) | LabeledDistribution(_, type):
table_prefix = ""
if isinstance(metric, LabeledDistribution):
table_prefix = "labeled_"
match type:
case DistributionType.timing:
data["table"] = f"{table_prefix}timing_distribution"
data["unit"] = "nanoseconds"
case DistributionType.memory:
data["table"] = f"{table_prefix}memory_distribution"
data["unit"] = "bytes"
case DistributionType.custom:
data["table"] = f"{table_prefix}custom_distribution"
data["unit"] = ""
return data

@dataclass
class MetricGroup:
    """
    Group of metrics to aggregate.

    This normally corresponds to a top-level key in the `metrics.yaml` file for a Rust component.

    Each metric listed in `metrics` yields one derived table and one
    user-facing view when `generate` runs.
    """

    ping: str
    """Name of the Glean ping that contains metrics for this component."""

    category: str
    """Metric category, this is the top-level key in the `metrics.yaml` file."""

    applications: list[Application]
    """Applications that collect these metrics"""

    metrics: list[Metric]
    """Metrics to aggregate in the derived dataset"""

@click.command()
@click.option(
    "--output-dir",
    "--output_dir",
    help="Output directory generated SQL is written to",
    type=click.Path(file_okay=False),
    default="sql",
)
@click.option(
    "--target-project",
    "--target_project",
    help="Google Cloud project ID",
    default="moz-fx-data-shared-prod",
)
@use_cloud_function_option
def generate(target_project, output_dir, use_cloud_function):
    """Generate queries, views and metadata for Rust component metric aggregates.

    For every metric in `all_metric_groups` this writes one derived table
    (`<metric>_v1` in `rust_component_derived`, built by UNION ALL-ing the
    per-application query template output) and one user-facing view (in
    `rust_component_metrics`).

    The parent folders will be created if not existing and existing files
    will be overwritten.
    """
    output_dir = Path(output_dir) / target_project
    # All render() calls below read from the same template folder.
    template_folder = str(THIS_PATH / "templates")

    for metric_group in all_metric_groups():
        for metric in metric_group.metrics:
            # NOTE(review): confirm these dataset names match the dataset
            # metadata directories added alongside this generator
            # ("Shared Rust Components" / "... Derived").
            full_table_id = f"{target_project}.rust_component_derived.{metric.name}_v1"
            full_view_id = f"{target_project}.rust_component_metrics.{metric.name}"
            metric_data = get_metric_data(metric)

            # One rendered query per application, combined into one table.
            query_sql_parts = [
                render(
                    f"{metric.template_dir}/query.sql",
                    template_folder=template_folder,
                    application=application.name,
                    dataset_name=application.value,
                    ping=metric_group.ping,
                    category=metric_group.category,
                    metric=metric_data,
                    format=True,
                )
                for application in metric_group.applications
            ]

            write_sql(
                output_dir=output_dir,
                full_table_id=full_table_id,
                basename="query.sql",
                sql="\nUNION ALL\n".join(query_sql_parts),
                skip_existing=False,
            )

            # All metric kinds share a single metadata template.
            write_sql(
                output_dir=output_dir,
                full_table_id=full_table_id,
                basename="metadata.yaml",
                sql=render(
                    "metadata.yaml",
                    template_folder=template_folder,
                    ping=metric_group.ping,
                    category=metric_group.category,
                    metric=metric_data,
                    format=False,
                ),
                skip_existing=False,
            )

            # Per-kind schema (counter/event vs. distribution percentiles).
            write_sql(
                output_dir=output_dir,
                full_table_id=full_table_id,
                basename="schema.yaml",
                sql=render(
                    f"{metric.template_dir}/schema.yaml",
                    template_folder=template_folder,
                    metric=metric_data,
                    format=False,
                ),
                skip_existing=False,
            )

            write_sql(
                output_dir=output_dir,
                full_table_id=full_view_id,
                basename="view.sql",
                sql=render(
                    "view.sql",
                    template_folder=template_folder,
                    full_view_id=full_view_id,
                    full_table_id=full_table_id,
                    format=True,
                ),
                skip_existing=False,
            )
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- Daily total of the {{ category }}.{{ metric.name }} counter for a single
-- application. Rendered once per application; the generator UNION ALLs the
-- rendered queries into one table.
SELECT
  DATE(submission_timestamp) AS submission_date,
  "{{ application }}" as application,
  SUM(metrics.counter.{{ category }}_{{ metric.name }}) as count
FROM `mozdata.{{ dataset_name }}.metrics`
WHERE
  DATE(submission_timestamp) = @submission_date AND
  metrics.counter.{{ category }}_{{ metric.name }} IS NOT NULL
GROUP BY 1, 2
-- NOTE(review): `application` is a constant per rendered query and the event
-- template groups by submission_date only -- confirm BigQuery accepts
-- grouping by ordinal 2 here, or drop it for consistency.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
fields:
- name: submission_date
type: DATE
description: |-
Date the metric was submitted
- name: application
type: STRING
description: |-
Application the metric was collected for
- name: count
type: INTEGER
description: |-
Total count
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- Daily percentiles of the {{ category }}.{{ metric.name }} distribution for
-- a single application. Rendered once per application; the generator
-- UNION ALLs the rendered queries into one table.
SELECT
  submission_date,
  "{{ application }}" as application,
  q[1] as q001,
  q[10] as q01,
  q[50] as q05,
  q[500] as q50,
  q[950] as q95,
  q[990] as q99,
  q[999] as q999
FROM (
  SELECT
    DATE(submission_timestamp) AS submission_date,
    APPROX_QUANTILES(CAST(values.key AS INT64), 1000) as q
  FROM `mozdata.{{ dataset_name }}.metrics`
  CROSS JOIN UNNEST(metrics.{{ metric.table }}.{{ category }}_{{ metric.name }}.values) as values
  -- This generates multiple rows based on the `value` field. This is needed to make the `APPROX_QUANTILES`
  -- weigh `value.key` correctly.
  -- TODO(review): is this the best way to get percentiles for a distribution
  -- metric? No existing UDF for this was found.
  CROSS JOIN UNNEST(GENERATE_ARRAY(1, `values`.value))
  -- Filter inside the aggregation so BigQuery only scans the target
  -- partition (equivalent result to filtering outside, since the subquery
  -- groups by submission_date).
  WHERE
    DATE(submission_timestamp) = @submission_date
  GROUP BY 1
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
fields:
- name: submission_date
type: DATE
description: |-
Date the metric was submitted
- name: application
type: STRING
description: |-
Application the metric was collected for
- name: q001
type: INTEGER
description: 0.1th percentile{% if metric.unit %} ({{ metric.unit }}){% endif %}
- name: q01
type: INTEGER
description: 1st percentile{% if metric.unit %} ({{ metric.unit }}){% endif %}
- name: q05
type: INTEGER
description: 5th percentile{% if metric.unit %} ({{ metric.unit }}){% endif %}
- name: q50
type: INTEGER
description: 50th percentile{% if metric.unit %} ({{ metric.unit }}){% endif %}
- name: q95
type: INTEGER
description: 95th percentile{% if metric.unit %} ({{ metric.unit }}){% endif %}
- name: q99
type: INTEGER
description: 99th percentile{% if metric.unit %} ({{ metric.unit }}){% endif %}
- name: q999
type: INTEGER
description: 99.9th percentile{% if metric.unit %} ({{ metric.unit }}){% endif %}
11 changes: 11 additions & 0 deletions sql_generators/rust_component_metrics/templates/event/query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
-- Daily count of {{ category }}.{{ metric.name }} events for a single
-- application. Rendered once per application; the generator UNION ALLs the
-- rendered queries into one table.
SELECT
  DATE(submission_timestamp) AS submission_date,
  "{{ application }}" as application,
  COUNT(*) as count
FROM `mozdata.{{ dataset_name }}.events`
CROSS JOIN UNNEST(events) as events
WHERE
  events.category = "{{ category }}" AND
  events.name = "{{ metric.name }}" AND
  DATE(submission_timestamp) = @submission_date
GROUP BY 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
fields:
- name: submission_date
type: DATE
description: |-
Date the metric was submitted
- name: application
type: STRING
description: |-
Application the metric was collected for
- name: count
type: INTEGER
description: |-
Total count
Loading