261 changes: 219 additions & 42 deletions bigquery_etl/cli/query.py
@@ -152,8 +152,36 @@ def query(ctx):
default=False,
is_flag=True,
)
@click.option(
"--use_live",
"--use-live",
help=(
"""Using this option creates a query that consists of two tables with
different schedules based on a single base query, one that runs daily
and pulls from stable tables and another that runs more frequently and
pulls from live tables, plus a view that unions the two tables.
"""
),
default=False,
is_flag=True,
)
Comment on lines 156 to 167
Contributor:
quibble: While the description explains the nuance/meaning of this --use-live option, IMO the option name by itself implies "use live rather than stable".

I'd be inclined to name the option something like --use-live-and-stable or --from-live-and-stable.

@click.option(
"--hourly",
help=(
"""This options is a special case of the --use-live option for
tables that update hourly.

Using this option creates a query that consists of two tables with
different schedules based on a single base query, one that runs daily
and pulls from stable tables and another that runs hourly and
pulls from live tables, plus a view that unions the two tables.
"""
),
default=False,
is_flag=True,
)
Comment on lines 168 to 182
Contributor:
issues (blocking):

  • While the description explains the nuance/meaning of this --hourly option, IMO it's unnecessarily confusing, as just based on the name I think people could reasonably expect to be able to run bqetl query create --hourly <name> and get an hourly ETL that doesn't necessarily involve a combination of live and stable table sources.
  • This --hourly option is currently making the live ETL table use hourly partitioning, but to date hourly partitioning has been used very rarely (I currently only see one ETL using hourly partitioning), and I don't think it'd be advisable to encourage hourly partitioning due to some drawbacks:
    • It precludes the ETL doing aggregation at the daily level, which is a very common use case.
    • It makes it critical to ensure the success of all hourly ETL tasks to avoid missing any data, which is problematic because frequently running ETLs are more likely to have failures (e.g. due to transient issues with BigQuery or GKE), and such ETL failures are prone to getting missed during Airflow triage as by the time triage happens there could easily have been subsequent successful hourly DAG runs.

I'd be inclined to remove this --hourly option and have the generated live ETL simply assume an hourly schedule with daily partitioning, as that's the most common sub-daily ETL setup being used thus far.

@click.pass_context
def create(ctx, name, sql_dir, project_id, owner, dag, no_schedule):
def create(ctx, name, sql_dir, project_id, owner, dag, no_schedule, use_live, hourly):
"""CLI command for creating a new query."""
# create directory structure for query
try:
@@ -173,49 +201,63 @@ def create(ctx, name, sql_dir, project_id, owner, dag, no_schedule):
)
sys.exit(1)

derived_path = None
view_path = None
create_view_path = False
view_exist_ok = False
path = Path(sql_dir)
if hourly:
use_live = True
use_live_slug = "_hourly"
no_schedule = True
elif use_live:
use_live_slug = "_use_live"
no_schedule = True
Contributor:
suggestion (non-blocking): I think it would make sense to respect the --dag argument if it's specified (for the main ETL).

Contributor Author:
The --dag argument is still respected for the daily (non _live) table. Are you thinking that we should treat the live version as the main table? I was thinking the daily version is kind of the main table.

Contributor:
I agree the daily version should be considered the main table, as is currently the case, but here you're setting no_schedule = True, which means that even if a DAG was specified via --dag it won't be used for the main daily table.


if dataset.endswith("_derived"):
# create directory for this table
derived_path = path / project_id / dataset / (name + version)
derived_path.mkdir(parents=True)

# create a directory for the corresponding view
view_path = path / project_id / dataset.replace("_derived", "") / name
create_view_path = True
# new versions of existing tables may already have a view
view_path.mkdir(parents=True, exist_ok=True)
view_exist_ok = True
else:
# check if there is a corresponding derived dataset
# check if there is a corresponding derived dataset. If so, create
# the view path
if (path / project_id / (dataset + "_derived")).exists():
derived_path = path / project_id / (dataset + "_derived") / (name + version)
derived_path.mkdir(parents=True)
view_path = path / project_id / dataset / name
view_path.mkdir(parents=True)

dataset = dataset + "_derived"
else:
# some dataset that is not specified as _derived
# don't automatically create views
derived_path = path / project_id / dataset / (name + version)
derived_path.mkdir(parents=True)

click.echo(f"Created query in {derived_path}")
create_view_path = True

table_name = name + version
derived_path = path / project_id / dataset / table_name
derived_path.mkdir(parents=True)
if use_live:
use_live_table_name = name + use_live_slug + version
Contributor Author:
This results in names like my_table_v1 and my_table_use_live_v1. This is not my favorite because it means the two tables can be alphabetically separated, but putting anything other than the version at the end of a table name would break our current conventions. Improving our naming standards is a larger issue that I'd prefer not to get involved in here.

Contributor:
quibble: IMO _use_live reads very awkwardly. I'd be inclined to have the slug just be _live, or maybe _from_live.

Contributor Author:
I thought about _live but thought that could lead to confusion with the actual live tables. I like _from_live, I'll make that change

Collaborator:
fwiw there are already some queries that follow a similar pattern and do have the _live_v<n> suffix. E.g. experiment_events_live_v1, monitoring_derived.topsites_rate_fenix_release_live_v1 and more

use_live_path = path / project_id / dataset / (name + use_live_slug + version)
use_live_path.mkdir(parents=True)
if create_view_path:
view_path = path / project_id / dataset.replace("_derived", "") / name
view_path.mkdir(parents=True, exist_ok=view_exist_ok)

if view_path and not (view_file := view_path / "view.sql").exists():
if create_view_path and not (view_file := view_path / "view.sql").exists():
# Don't overwrite the view_file if it already exists
click.echo(f"Created corresponding view in {view_path}")
view_dataset = dataset.replace("_derived", "")
view_file.write_text(
reformat(
f"""CREATE OR REPLACE VIEW

if use_live:
view_text = f"""CREATE OR REPLACE VIEW
`{project_id}.{view_dataset}.{name}`
AS SELECT * FROM
`{project_id}.{dataset}.{name}{version}`"""
)
+ "\n"
)
`{project_id}.{dataset}.{table_name}`
UNION ALL
SELECT * FROM
`{project_id}.{dataset}.{use_live_table_name}`
WHERE submission_date > (
SELECT MAX(submission_date)
Collaborator:
This could potentially create an invalid query if the source table doesn't have submission_date. I think this is fine though. CI will show a failure in the dryrun task and it's reasonable to expect the PR creator to manually adjust the query.

Contributor:
@curtismorales I suspect you used submission_date in these queries because submission_date was being used in the existing query.sql template, but I don't think that's a good approach to continue, as live/stable tables only contain submission_timestamp columns (submission_date columns are only present in derived tables).

I'd suggest taking this opportunity to consistently use submission_timestamp in the generated ETL queries and as the default partitioning column (i.e. presuming the ETL will pass through submission_timestamp, which I think is a safer bet than presuming the ETL is going to have a derived submission_date column).

FROM `{project_id}.{dataset}.{table_name}`
)"""
Comment on lines 247 to 253
Contributor Author:
This felt to me like the best way to combine these in the view. This does mean that if the main (daily) table is missing certain days before the most recent, that data will not be filled in by the _use_live table, but that feels appropriate -- a missing day in the stable table is a genuine problem in the data; the _use_live data is ephemeral and should not be used to paper over missing historical data
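
For reference, a minimal sketch of how the generated union view might look if it filtered on submission_timestamp rather than submission_date, per the suggestion above (placeholders follow the surrounding template; this assumes both derived tables carry submission_timestamp through, and is an illustration rather than what the PR currently generates):

CREATE OR REPLACE VIEW
  `{project_id}.{view_dataset}.{name}`
AS SELECT * FROM
  `{project_id}.{dataset}.{table_name}`
UNION ALL
SELECT * FROM
  `{project_id}.{dataset}.{use_live_table_name}`
-- Only take live-derived rows newer than anything already in the stable-derived table.
WHERE submission_timestamp > (
  SELECT MAX(submission_timestamp)
  FROM `{project_id}.{dataset}.{table_name}`
)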

else:
view_text = f"""CREATE OR REPLACE VIEW
`{project_id}.{view_dataset}.{name}`
AS SELECT * FROM
`{project_id}.{dataset}.{table_name}`"""

view_file.write_text(reformat(view_text) + "\n")
Contributor:
suggestion (non-blocking): I know this approach of manually appending a newline is just following the convention of what the existing code was already doing, but it's worth noting that reformat() accepts a trailing_newline boolean argument specifically for that purpose.

Suggested change
view_file.write_text(reformat(view_text) + "\n")
view_file.write_text(reformat(view_text, trailing_newline=True))

(ditto for the other similar usages)


safe_owner = owner.lower().split("@")[0]

@@ -226,20 +268,108 @@ def create(ctx, name, sql_dir, project_id, owner, dag, no_schedule):
owners=[owner],
labels={"owner": safe_owner},
)

view_metadata.write(view_metadata_file)

click.echo(f"Created view in {view_path}")

# create query.sql file
query_file = derived_path / "query.sql"
query_file.write_text(
reformat(
f"""-- Query for {dataset}.{name}{version}
-- For more information on writing queries see:
-- https://docs.telemetry.mozilla.org/cookbooks/bigquery/querying.html
SELECT * FROM table WHERE submission_date = @submission_date"""
if use_live:
macro_file = path / project_id / dataset / (table_name + "_macros.jinja")
if hourly:
macro_file.write_text(
reformat(
f"""{{% macro {table_name}(use_live) %}}
SELECT
*
FROM
{{% if use_live %}}
table_live
Contributor:
suggestion (non-blocking): IMO it would be best for the generated SQL for selecting from the live table to attempt to deduplicate the records for that date.

Contributor Author:
The approach here was to not try to generate something that does this for the user because there isn't a clear one-size-fits-all deduplication method. The closest approaches I'm aware of are:

  1. Wrap the source table in a CTE that dedupes the entire day (e.g., SELECT * FROM live_table WHERE DATE(submission_timestamp) = @submission_timestamp QUALIFY ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY submission_timestamp) = 1) -- this does work in the general case but implies querying the whole day so far in the source table every time, which I don't think we want to make the default behavior
  2. Dedupe just the specific interval and exclude any document_ids already present in the destination table (or in some side table where we save all the document_ids so far) -- requires us to store document_ids in the destination table (or a side table)

I don't think either of these is a great option, and I don't want to give the user the wrong impression that this solves the deduplication problem for them out of the box. The intent instead is that the live and stable versions follow the same model as the live and stable base tables -- we don't make any promises about deduplication in the live tables. For most low latency purposes (e.g., reports that update several times a day) the slight inaccuracy is not an issue, and the numbers will be fully correct once the daily stable table runs; in these cases, we don't need to worry about deduplication. If deduping is needed, the user can still write the SQL to do that in the way they prefer.

That said, I do think it makes sense to make this explicit in the generated SQL (not just in documentation elsewhere) -- I'll start with adding a comment to that effect
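
A minimal sketch of option 1 above, assuming the live source table exposes document_id and submission_timestamp (the table name is illustrative):

SELECT
  *
FROM
  `moz-fx-data-shared-prod.telemetry_live.example_live_v1`
WHERE
  DATE(submission_timestamp) = @submission_date
-- Keep only the first record seen per document_id for the day.
QUALIFY
  ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY submission_timestamp) = 1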

Contributor (sean-rose, Feb 6, 2026):
Wrap the source table in a CTE that dedupes the entire day (e.g., SELECT * FROM live_table WHERE DATE(submission_timestamp) = @submission_timestamp QUALIFY ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY submission_timestamp) = 1) -- this does work in the general case but implies querying the whole day so far in the source table every time, which I don't think we want to make the default behavior

That appears to be the most common behavior though, which to me is a decent argument for making it the default behavior:

  • In bigquery-etl I currently see 6 ETLs selecting from live tables (counting the generated baseline_clients_city_seen_v1 ETLs as one ETL), all of which query the whole day, and 5 of which do deduplication.
  • In private-bigquery-etl I currently see 12 ETLs selecting from live tables, 9 of which query the whole day, and all of which attempt some form of deduplication.

I agree there's no great option, but as I mentioned in a couple other comments, I don't think it would be a good idea to set things up so that absolutely all hourly ETL runs have to succeed to get complete data, because transient issues do happen and Airflow triage isn't great at catching failures in DAGs that run more frequently than daily. IMO querying the whole day and overwriting the corresponding date partition is the most straightforward way to avoid that issue. The main downsides are that it's less efficient, and that you still need to be careful that the last ETL run targeting a particular date partition was successful to avoid missing data.

Another approach that could work in some use cases is having the query only select records after the max submission_timestamp the ETL has already seen thus far, and then appending newer records to the table rather than overwriting a partition. This does have the benefits of being more efficient and completely self-healing in the event of transient failures (any successful ETL run will fix the entire table). However, it requires the submission_timestamp to be included in the ETL table, makes it a self-referential ETL (which is trickier to deal with in certain scenarios), and precludes the ability to do the same deduplication as is done for the stable tables.
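
A rough sketch of that append-only variant, assuming submission_timestamp is carried through into the derived table and the job appends (WRITE_APPEND) rather than overwriting a partition (table names are illustrative):

SELECT
  *
FROM
  `moz-fx-data-shared-prod.telemetry_live.example_live_v1`
WHERE
  -- Self-referential filter: only pull records newer than anything already loaded.
  submission_timestamp > (
    SELECT COALESCE(MAX(submission_timestamp), TIMESTAMP("1970-01-01"))
    FROM `moz-fx-data-shared-prod.telemetry_derived.example_from_live_v1`
  )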

@scholtzan do you have an opinion on this?

Collaborator:
For most low latency purposes (e.g., reports that update several times a day) the slight inaccuracy is not an issue, and the numbers will be fully correct once the daily stable table runs

Afaik, this is the case for many use cases I have seen. Small differences don't seem like a big issue, though if we have a good approach to make these numbers more accurate, that would be a plus.

Maybe we can have a separate parameter --deduplicate that adds the deduplication logic using the document_id (option 1 that Curtis mentioned earlier)?

Another approach that could work in some use cases is having the query only select records after the max submission_timestamp

I don't think it's guaranteed that new records have an increasing submission_timestamp. Afaik it's possible that records with earlier submission_timestamps get added to the tables after records with later submission_timestamps. This could in the worst case result in missing records if we follow this approach.

{{% else %}}
table_stable
{{% endif %}}
Contributor:
quibble: I think it'd be more readable to have these placeholders match how we refer to live/stable tables.

Suggested change
{{% if use_live %}}
table_live
{{% else %}}
table_stable
{{% endif %}}
{{% if use_live %}}
live_table
{{% else %}}
stable_table
{{% endif %}}

WHERE
{{% if use_live %}}
TIMESTAMP_TRUNC(submission_timestamp, HOUR) = @submission_hour
{{% else %}}
TIMESTAMP_TRUNC(submission_date, DAY) = @submission_date
{{% endif %}}
{{% endmacro %}}"""
)
+ "\n"
)
else:
macro_file.write_text(
reformat(
f"""{{% macro {table_name}(use_live) %}}
Contributor Author:
Because we can only partition by day or by hour, there isn't really a simple standard SQL for this case. This is what I came up with, but I'm open to other suggestions

SELECT
*
FROM
{{% if use_live %}}
table_live
{{% else %}}
table_stable
{{% endif %}}
WHERE
{{% if use_live %}}
submission_timestamp >= @interval_start
AND submission_timestamp < @interval_end
{{% else %}}
TIMESTAMP_TRUNC(submission_date, DAY) = @submission_date
{{% endif %}}
{{% if use_live %}}
-- Overwrite the daily partition with a combination of new records for
-- the given interval (above) and existing records outside the given
-- interval (below)
UNION ALL
SELECT
*
FROM
{project_id}.{dataset}.{use_live_table_name}
WHERE
TIMESTAMP_TRUNC(submission_date, DAY) = TIMESTAMP_TRUNC(@interval_start, DAY)
AND (
submission_timestamp < @interval_start
OR submission_timestamp >= @interval_end
)
{{% endif %}}
Comment on lines 318 to 340
Contributor:
  • issue: submission_date wouldn't be a timestamp.
  • suggestion (non-blocking): It would be best to quote the live ETL table name.
  • note: While this approach of only running the live ETL on new records and reusing existing records is likely better for performance, it unfortunately has the same drawbacks as when using hourly partitioning that I mentioned in my comment about the --hourly command line option. I'd be inclined to run the live ETL on all records for the date in question.
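
A minimal sketch of the macro body with those adjustments, as it would appear in the generated _macros.jinja file: both branches filter the whole date on submission_timestamp, so the live ETL reprocesses the full day on each run (placeholder table names are illustrative):

{% macro example_table_v1(use_live) %}
SELECT
  *
FROM
  {% if use_live %}
  live_table
  {% else %}
  stable_table
  {% endif %}
WHERE
  -- Processing the full date means any successful run replaces earlier partial results.
  DATE(submission_timestamp) = @submission_date
{% endmacro %}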

{{% endmacro %}}"""
)
+ "\n"
)
click.echo(f"Created base query in {macro_file}")
query_file = derived_path / "query.sql"
Collaborator:
might be worth creating a QUERY_FILE = "query.sql" constant here, since there are multiple usages. Similar to what is already done for materialized_view.sql (MATERIALIZED_VIEW = "materialized_view.sql"), etc.

query_file.write_text(
reformat(
f"""-- Query for {dataset}.{table_name}
-- For more information on writing queries see:
-- https://docs.telemetry.mozilla.org/cookbooks/bigquery/querying.html
{{% from "{macro_file}" import {table_name} %}}
{{{{ {table_name}(use_live=false) }}}}"""
)
+ "\n"
)
use_live_file = use_live_path / "query.sql"
use_live_file.write_text(
reformat(
f"""-- Query for {dataset}.{use_live_table_name}
-- For more information on writing queries see:
-- https://docs.telemetry.mozilla.org/cookbooks/bigquery/querying.html
{{% from "{macro_file}" import {table_name} %}}
{{{{ {table_name}(use_live=true) }}}}"""
)
+ "\n"
)
else:
query_file = derived_path / "query.sql"
query_file.write_text(
reformat(
f"""-- Query for {dataset}.{name}{version}
-- For more information on writing queries see:
-- https://docs.telemetry.mozilla.org/cookbooks/bigquery/querying.html
SELECT * FROM table WHERE submission_date = @submission_date"""
)
+ "\n"
)
+ "\n"
)

# create default metadata.yaml
metadata_file = derived_path / "metadata.yaml"
@@ -255,6 +385,53 @@ def create(ctx, name, sql_dir, project_id, owner, dag, no_schedule):
require_column_descriptions=True,
)
metadata.write(metadata_file)
click.echo(f"Created query in {derived_path}")

if use_live:
use_live_metadata_file = use_live_path / "metadata.yaml"
if hourly:
labels = {"incremental": True, "schedule": hourly}
Contributor:
suggestion (blocking):

Suggested change
labels = {"incremental": True, "schedule": hourly}
labels = {"incremental": True, "schedule": "hourly"}

time_partitioning = PartitionMetadata(
field="submission_hour",
type=PartitionType.HOUR,
expiration_days=30,
)
parameters = [
"submission_hour:DATE:{{(execution_date - macros.timedelta(hours=1)).strftime('%Y-%m-%d %h:00:00')}}",
]
destination_table = f"{use_live_table_name}${{{{(execution_date - macros.timedelta(hours=1)).strftime('%Y%m%d%h)}}}}"
Contributor Author:
These will go away once the logic for calculating these parameters is in place

Contributor:
note: Subtracting an hour like this isn't always necessary, especially if the hourly ETLs aren't scheduled at the top of the hour.

Contributor:
suggestion (blocking):

Suggested change
parameters = [
"submission_hour:DATE:{{(execution_date - macros.timedelta(hours=1)).strftime('%Y-%m-%d %h:00:00')}}",
]
destination_table = f"{use_live_table_name}${{{{(execution_date - macros.timedelta(hours=1)).strftime('%Y%m%d%h)}}}}"
parameters = [
"submission_hour:TIMESTAMP:{{(execution_date - macros.timedelta(hours=1)).strftime('%Y-%m-%d %H:00:00')}}",
]
destination_table = f"{use_live_table_name}${{{{(execution_date - macros.timedelta(hours=1)).strftime('%Y%m%d%H)}}}}"

else:
labels = {"incremental": True}
time_partitioning = PartitionMetadata(
field="",
type=PartitionType.DAY,
expiration_days=30,
)
parameters = [
"interval_start:DATE:{{}}",
Collaborator:
Should interval_start be set to something for now?

"interval_end:DATE:{{(execution_date - macros.timedelta(hours=1).strftime('%Y-%m-%d %h:%m:%s'))}}",
Contributor:
suggestion (blocking):

Suggested change
"interval_start:DATE:{{}}",
"interval_end:DATE:{{(execution_date - macros.timedelta(hours=1).strftime('%Y-%m-%d %h:%m:%s'))}}",
"interval_start:TIMESTAMP:{{}}",
"interval_end:TIMESTAMP:{{(execution_date - macros.timedelta(hours=1).strftime('%Y-%m-%d %H:%M:%S'))}}",

Though I'm not sure subtracting an extra hour would necessarily always be advisable for arbitrary intervals.

]
destination_table = f"{use_live_table_name}${{{{(execution_date - macros.timedelta(hours=1)).strftime('%Y%m%d)}}}}"
Contributor Author:
Same as above

use_live_metadata = Metadata(
friendly_name=string.capwords((name + use_live_slug).replace("_", " ")),
description="Please provide a description for the query",
owners=[owner],
labels=labels,
scheduling={
"dag_name": None,
"date_partition_parameter": None,
"parameters": parameters,
"destination_table": destination_table,
"query_file_path": use_live_path / "query.sql",
Contributor:
suggestion (blocking): Specifying query_file_path like this isn't actually necessary and makes maintenance harder.

Suggested change
"query_file_path": use_live_path / "query.sql",

},
bigquery=BigQueryMetadata(
time_partitioning=time_partitioning,
clustering=ClusteringMetadata(fields=[]),
),
require_column_descriptions=True,
)
use_live_metadata.write(use_live_metadata_file)
click.echo(f"Created use_live query in {use_live_path}")

dataset_metadata_file = derived_path.parent / "dataset_metadata.yaml"
if not dataset_metadata_file.exists():
Expand All @@ -268,7 +445,7 @@ def create(ctx, name, sql_dir, project_id, owner, dag, no_schedule):
dataset_metadata.write(dataset_metadata_file)
click.echo(f"Created dataset metadata in {dataset_metadata_file}")

if view_path:
if create_view_path:
dataset_metadata_file = view_path.parent / "dataset_metadata.yaml"
if not dataset_metadata_file.exists():
dataset_name = str(dataset_metadata_file.parent.name)
62 changes: 62 additions & 0 deletions tests/cli/test_cli_query.py
@@ -126,6 +126,68 @@ def test_create_query_in_named_dag(self, runner):
exists = "dag_name: bqetl_test" in file.read()
assert exists

def test_create_use_live_query(self, runner):
with runner.isolated_filesystem():
os.makedirs("sql/moz-fx-data-shared-prod/test_derived")
result = runner.invoke(create, ["test.test_query", "--use_live"])
assert result.exit_code == 0
assert sorted(os.listdir("sql/moz-fx-data-shared-prod")) == [
"test",
"test_derived",
]
assert sorted(os.listdir("sql/moz-fx-data-shared-prod/test_derived")) == [
"dataset_metadata.yaml",
"test_query_use_live_v1",
"test_query_v1",
"test_query_v1_macros.jinja",
]
assert sorted(
os.listdir("sql/moz-fx-data-shared-prod/test_derived/test_query_v1")
) == ["metadata.yaml", "query.sql"]
assert sorted(
os.listdir(
"sql/moz-fx-data-shared-prod/test_derived/test_query_use_live_v1"
)
) == ["metadata.yaml", "query.sql"]
assert sorted(os.listdir("sql/moz-fx-data-shared-prod/test")) == [
"dataset_metadata.yaml",
"test_query",
]
assert sorted(
os.listdir("sql/moz-fx-data-shared-prod/test/test_query")
) == ["metadata.yaml", "view.sql"]

def test_create_hourly_query(self, runner):
with runner.isolated_filesystem():
os.makedirs("sql/moz-fx-data-shared-prod/test_derived")
result = runner.invoke(create, ["test.test_query", "--hourly"])
assert result.exit_code == 0
assert sorted(os.listdir("sql/moz-fx-data-shared-prod")) == [
"test",
"test_derived",
]
assert sorted(os.listdir("sql/moz-fx-data-shared-prod/test_derived")) == [
"dataset_metadata.yaml",
"test_query_hourly_v1",
"test_query_v1",
"test_query_v1_macros.jinja",
]
assert sorted(
os.listdir("sql/moz-fx-data-shared-prod/test_derived/test_query_v1")
) == ["metadata.yaml", "query.sql"]
assert sorted(
os.listdir(
"sql/moz-fx-data-shared-prod/test_derived/test_query_hourly_v1"
)
) == ["metadata.yaml", "query.sql"]
assert sorted(os.listdir("sql/moz-fx-data-shared-prod/test")) == [
"dataset_metadata.yaml",
"test_query",
]
assert sorted(
os.listdir("sql/moz-fx-data-shared-prod/test/test_query")
) == ["metadata.yaml", "view.sql"]

def test_create_query_with_version(self, runner):
with runner.isolated_filesystem():
os.makedirs("sql/moz-fx-data-shared-prod")