diff --git a/.gitignore b/.gitignore index e9402e1b..f403247a 100644 --- a/.gitignore +++ b/.gitignore @@ -146,3 +146,5 @@ cython_debug/ .idea test_artifacts/ artifacts/ + +pylintrc diff --git a/rasgotransforms/rasgotransforms/macros/bigquery/aggregate_metrics.sql b/rasgotransforms/rasgotransforms/macros/bigquery/aggregate_metrics.sql index d530a567..369262cf 100644 --- a/rasgotransforms/rasgotransforms/macros/bigquery/aggregate_metrics.sql +++ b/rasgotransforms/rasgotransforms/macros/bigquery/aggregate_metrics.sql @@ -1,4 +1,3 @@ -{% from 'filter.sql' import get_filter_statement %} {% from 'secondary_calculation.sql' import render_secondary_calculations %} {% macro calculate_timeseries_metric_values( diff --git a/rasgotransforms/rasgotransforms/macros/distinct_values.sql b/rasgotransforms/rasgotransforms/macros/distinct_values.sql index 87df3433..0f278fed 100644 --- a/rasgotransforms/rasgotransforms/macros/distinct_values.sql +++ b/rasgotransforms/rasgotransforms/macros/distinct_values.sql @@ -1,5 +1,3 @@ -{% from 'filter.sql' import get_filter_statement %} - {% macro get_distinct_vals( columns, target_metric, diff --git a/rasgotransforms/rasgotransforms/macros/expression_metrics.sql b/rasgotransforms/rasgotransforms/macros/expression_metrics.sql index 1f753cf7..3d17c682 100644 --- a/rasgotransforms/rasgotransforms/macros/expression_metrics.sql +++ b/rasgotransforms/rasgotransforms/macros/expression_metrics.sql @@ -1,5 +1,4 @@ {% from 'aggregate_metrics.sql' import calculate_timeseries_metric_values %} -{% from 'filter.sql' import combine_filters %} {% from 'secondary_calculation.sql' import render_secondary_calculations %} {% macro calculate_expression_metric_values( diff --git a/rasgotransforms/rasgotransforms/macros/filter.sql b/rasgotransforms/rasgotransforms/macros/filter.sql index 1b47d8c2..e69de29b 100644 --- a/rasgotransforms/rasgotransforms/macros/filter.sql +++ b/rasgotransforms/rasgotransforms/macros/filter.sql @@ -1,59 +0,0 @@ -{% macro get_filter_statement(filters) %} -{% if filters %} -{% if filters is string %} -{{ filters }} -{% else %} -{% set logical_operator = namespace(value='AND') %} -{% for filter in filters %} - {% if filter is not string %} - {% if filter is not mapping %} - {% set filter = dict(filter) %} - {% endif %} - {% if filter is mapping and 'compoundBoolean' in filter and filter['compoundBoolean'] %} - {% set logical_operator.value = filter['compoundBoolean'] %} - {% endif %} - {% endif %} -{% endfor %} -( - {% for filter in filters %} - {% if filter is not string and filter is not mapping %} - {% set filter = dict(filter) %} - {% endif %} - {% if 'columnName' in filter %} - {% do filter.__setitem__('column_name', filter.columnName) %} - {% endif %} - {% if 'comparisonValue' in filter %} - {% do filter.__setitem__('comparison_value', filter.comparisonValue) %} - {% endif %} - {% if filter is not mapping %} - {{ logical_operator.value + ' ' if not loop.first }}{{ filter }} - {% elif filter.operator|upper == 'CONTAINS' %} - {{ logical_operator.value + ' ' if not loop.first }}{{ filter.column_name }} like '%{{ filter.comparison_value }}%' - {% else %} - {{ logical_operator.value + ' ' if not loop.first }}{{ filter.column_name }} {{ filter.operator }} {{ filter.comparison_value }} - {% endif %} - {% endfor %} -) -{% endif %} -{% else %} -true -{% endif %} -{% endmacro %} - - -{% macro combine_filters(filters_a, filters_b, condition) %} -{% set condition = condition if condition is defined else 'AND' %} -{% if filters_a and not filters_b %} -{{ get_filter_statement(filters_a) }} -{% elif filters_b and not filters_a %} -{{ get_filter_statement(filters_b) }} -{% elif not filters_a and not filters_b %} -true -{% else %} -( - {{ get_filter_statement(filters_a)|indent }} - {{ condition }} - {{ get_filter_statement(filters_b)|indent }} -) -{% endif %} -{% endmacro %} diff --git a/rasgotransforms/rasgotransforms/macros/snowflake/aggregate_metrics.sql b/rasgotransforms/rasgotransforms/macros/snowflake/aggregate_metrics.sql index e56b74db..4c4248e2 100644 --- a/rasgotransforms/rasgotransforms/macros/snowflake/aggregate_metrics.sql +++ b/rasgotransforms/rasgotransforms/macros/snowflake/aggregate_metrics.sql @@ -1,4 +1,3 @@ -{% from 'filter.sql' import get_filter_statement %} {% from 'secondary_calculation.sql' import render_secondary_calculations %} {% macro calculate_timeseries_metric_values( diff --git a/rasgotransforms/rasgotransforms/render/environment.py b/rasgotransforms/rasgotransforms/render/environment.py index 03f959fd..276bdc3b 100644 --- a/rasgotransforms/rasgotransforms/render/environment.py +++ b/rasgotransforms/rasgotransforms/render/environment.py @@ -1,9 +1,10 @@ +import functools import re from datetime import datetime, timedelta from itertools import combinations, permutations, product -from typing import Callable, Optional, Dict, Union -from pathlib import Path from os.path import getmtime +from pathlib import Path +from typing import Callable, Dict, List, Optional, Union from jinja2 import Environment, BaseLoader from jinja2.exceptions import TemplateNotFound @@ -12,19 +13,34 @@ from rasgotransforms.exceptions import RenderException from rasgotransforms.main import DataWarehouse +ALLOWED_OPERATORS = ( + ">", + "<", + "=", + "<>", + ">=", + "<=", + "CONTAINS", + "IS NULL", + "IS NOT NULL", + "NOT CONTAINS", + "IN", + "NOT IN", +) + class RasgoEnvironment(Environment): def __init__(self, run_query: Optional[Callable] = None, dw_type: Optional[str] = None, *args, **kwargs): super().__init__(*args, extensions=self.rasgo_extensions, loader=RasgoLoader(), **kwargs) if not dw_type: - dw_type = 'snowflake' + dw_type = "snowflake" self._dw_type = DataWarehouse(dw_type) for filter_name, method in self.rasgo_filters.items(): self.filters[filter_name] = method for name, value in self.rasgo_globals.items(): self.globals[name] = value self._run_query = run_query - self.globals['run_query'] = self._run_query + self.globals["run_query"] = self._run_query @property def dw_type(self) -> DataWarehouse: @@ -36,7 +52,7 @@ def dw_type(self, dw_type: str): @property def rasgo_extensions(self): - return ['jinja2.ext.do', 'jinja2.ext.loopcontrols'] + return ["jinja2.ext.do", "jinja2.ext.loopcontrols"] @property def rasgo_globals(self): @@ -44,6 +60,8 @@ def rasgo_globals(self): "min": min, "max": max, "cleanse_name": cleanse_template_symbol, + "get_filter_statement": functools.partial(get_filter_statement, dw_type=self.dw_type), + "combine_filters": functools.partial(combine_filters, dw_type=self.dw_type), "raise_exception": raise_exception, "itertools": {"combinations": combinations, "permutations": permutations, "product": product}, "dw_type": lambda: self.dw_type.value, @@ -73,57 +91,123 @@ def render( Source columns can is a mapping of table names to columns (e.g. {table_name: {column_name: column_type}}) """ - arguments['source_table'] = source_table + arguments["source_table"] = source_table if not override_globals: override_globals = {} - if source_columns and 'get_columns' not in override_globals: + if source_columns and "get_columns" not in override_globals: def get_columns(fqtn): return source_columns[fqtn] - override_globals['get_columns'] = get_columns - if 'get_columns' in override_globals: - self.globals['get_columns'] = override_globals['get_columns'] + override_globals["get_columns"] = get_columns + if "get_columns" in override_globals: + self.globals["get_columns"] = override_globals["get_columns"] try: template = self.from_string(source_code) rendered = template.render(**arguments, **override_globals) except Exception as e: - raise RenderException(e) + raise RenderException(e) from e return trim_blank_lines(rendered) def cleanse_template_symbol(symbol: str) -> str: - symbol = str(symbol).strip().replace(' ', '_').replace('-', '_') - symbol = re.sub('[^A-Za-z0-9_]+', '', symbol) - symbol = '_' + symbol if not symbol or symbol[0].isdecimal() else symbol + symbol = str(symbol).strip().replace(" ", "_").replace("-", "_") + symbol = re.sub("[^A-Za-z0-9_]+", "", symbol) + symbol = "_" + symbol if not symbol or symbol[0].isdecimal() else symbol return symbol +def combine_filters( + filters_a: Union[List, str], + filters_b: Union[List, str], + condition: str, + dw_type: DataWarehouse, +) -> str: + """ + Parse & combine multiple filters, return a single SQL statement + """ + condition = condition or "AND" + if filters_a and not filters_b: + return get_filter_statement(filters_a, dw_type) + elif filters_b and not filters_a: + return get_filter_statement(filters_b, dw_type) + elif not filters_a and not filters_b: + return "TRUE" + return f"({get_filter_statement(filters_a, dw_type)} {condition} {get_filter_statement(filters_b, dw_type)})" + + +def get_filter_statement( + filters: Union[List, str], + dw_type: DataWarehouse, +) -> str: + """ + Parse a list of string or dict filters to a simple SQL string + """ + if isinstance(filters, str): + return filters + + filter_string = "TRUE" + compound_boolean = "AND" + for fil in filters: + if isinstance(fil, dict): + # Handle variable casing from past versions: only support snake eventually + column_name = fil.get("column_name", fil.get("columnName")) + operator = fil.get("operator", "").upper() + comparison_value = fil.get("comparison_value", fil.get("comparisonValue")) + compound_boolean = fil.get("compound_boolean", fil.get("compoundBoolean", compound_boolean)) + + # Handle override operators + if operator not in ALLOWED_OPERATORS: + raise_exception(f"operator {operator} is not supported") + if operator == "CONTAINS": + operator = "LIKE" + + # Parse complex comparison values + if isinstance(comparison_value, dict): + # Relative Date filter + if comparison_value.get("type", "").upper() == "RELATIVEDATE": + date_part = comparison_value.get("date_part", comparison_value.get("datePart")) + interval = f"{'-' if comparison_value['direction'] == 'past' else ''}{comparison_value['offset']}" + if not (date_part and interval): + raise_exception("relativedate comparisons must pass arguments: date_part, offset, direction") + print(dw_type) + if dw_type is DataWarehouse.BIGQUERY: + comparison_value = f"DATE_ADD(CURRENT_DATE(), INTERVAL {interval} {date_part})" + elif dw_type is DataWarehouse.SNOWFLAKE: + comparison_value = f"DATEADD({date_part}, {interval}, CURRENT_DATE)" + else: + comparison_value = f"(CURRENT_DATE + INTERVAL '{interval} {date_part}')" + filter_string += f" {compound_boolean} {column_name} {operator} {comparison_value} \n" + elif isinstance(fil, str) and fil != "": + filter_string += f" {compound_boolean} {fil} \n" + return filter_string + + def raise_exception(message: str) -> None: raise Exception(message) def trim_blank_lines(sql: str) -> str: - return re.sub(r'[\n][\s]*\n', '\n', sql) + return re.sub(r"[\n][\s]*\n", "\n", sql) def get_timedelta(time_grain: str, interval: int) -> timedelta: time_grain = time_grain.lower() - if time_grain == 'hour': + if time_grain == "hour": return timedelta(hours=interval) - elif time_grain == 'day': + elif time_grain == "day": return timedelta(days=interval) - elif time_grain == 'week': + elif time_grain == "week": return timedelta(weeks=interval) - elif time_grain == 'month': + elif time_grain == "month": interval *= 31 return timedelta(days=interval) - elif time_grain == 'quarter': + elif time_grain == "quarter": interval *= 122 return timedelta(days=interval) - elif time_grain == 'year': + elif time_grain == "year": interval *= 365 return timedelta(days=interval) else: @@ -133,7 +217,7 @@ def get_timedelta(time_grain: str, interval: int) -> timedelta: class RasgoLoader(BaseLoader): def __init__(self, root_path=None): if not root_path: - root_path = Path(__file__).parent.parent / 'macros' + root_path = Path(__file__).parent.parent / "macros" self.root_path = root_path def get_source(self, environment: RasgoEnvironment, template): diff --git a/rasgotransforms/rasgotransforms/transforms/clean/bigquery/clean.sql b/rasgotransforms/rasgotransforms/transforms/clean/bigquery/clean.sql index 613ecacc..afb80533 100644 --- a/rasgotransforms/rasgotransforms/transforms/clean/bigquery/clean.sql +++ b/rasgotransforms/rasgotransforms/transforms/clean/bigquery/clean.sql @@ -56,7 +56,7 @@ {{ output }} as {{ output_col_name }} {%- endmacro -%} -{%- macro get_filter_statement(columns) -%} +{%- macro get_where_clause(columns) -%} {%- set filter_statements = [] -%} {%- for column in columns.keys() %} {%- set output_col_name = column if columns[column].name is not defined else cleanse_name(columns[column].name) -%} @@ -68,7 +68,7 @@ {{ "where " if loop.first else "" }}{{ filter_statement }}{{ " \n and " if not loop.last else "" }} {%- endfor %} {%- endmacro -%} -{%- set filter_statement = get_filter_statement(columns) -%} +{%- set filter_statement = get_where_clause(columns) -%} {%- if filter_statement == "" -%} select diff --git a/rasgotransforms/rasgotransforms/transforms/clean/clean.sql b/rasgotransforms/rasgotransforms/transforms/clean/clean.sql index 346f971a..b6bb959f 100644 --- a/rasgotransforms/rasgotransforms/transforms/clean/clean.sql +++ b/rasgotransforms/rasgotransforms/transforms/clean/clean.sql @@ -34,7 +34,7 @@ {{ output }} as {{ output_col_name }} {%- endmacro -%} -{%- macro get_filter_statement(columns) -%} +{%- macro get_where_clause(columns) -%} {%- set filter_statements = [] -%} {%- for column in columns.keys() %} {%- set output_col_name = column if columns[column].name is not defined else cleanse_name(columns[column].name) -%} @@ -46,7 +46,7 @@ {{ "where " if loop.first else "" }}{{ filter_statement }}{{ " \n and " if not loop.last else "" }} {%- endfor %} {%- endmacro -%} -{%- set filter_statement = get_filter_statement(columns) -%} +{%- set filter_statement = get_where_clause(columns) -%} {%- if filter_statement == "" -%} select diff --git a/rasgotransforms/rasgotransforms/transforms/filter/filter.sql b/rasgotransforms/rasgotransforms/transforms/filter/filter.sql index 73450e4a..b1099711 100644 --- a/rasgotransforms/rasgotransforms/transforms/filter/filter.sql +++ b/rasgotransforms/rasgotransforms/transforms/filter/filter.sql @@ -1,4 +1,3 @@ -{% from 'filter.sql' import get_filter_statement %} {% if items is not defined %} {% if filter_statements is not defined %} {{ raise_exception('items is empty: there are no filters to apply') }} diff --git a/rasgotransforms/rasgotransforms/transforms/metric_plot/metric_plot.sql b/rasgotransforms/rasgotransforms/transforms/metric_plot/metric_plot.sql index d13cce20..15efdaf8 100644 --- a/rasgotransforms/rasgotransforms/transforms/metric_plot/metric_plot.sql +++ b/rasgotransforms/rasgotransforms/transforms/metric_plot/metric_plot.sql @@ -2,7 +2,6 @@ {% from 'expression_metrics.sql' import calculate_expression_metric_values %} {% from 'distinct_values.sql' import get_distinct_vals %} {% from 'pivot.sql' import pivot_plot_values %} -{% from 'filter.sql' import get_filter_statement, combine_filters %} {% from 'secondary_calculation.sql' import render_secondary_calculations, adjust_start_date %} {% set dimensions = group_by_dimensions if group_by_dimensions is defined else [] %} {% set max_num_groups = max_num_groups if max_num_groups is defined else 10 %} diff --git a/rasgotransforms/rasgotransforms/transforms/plot/plot.sql b/rasgotransforms/rasgotransforms/transforms/plot/plot.sql index 71169d77..f1779f17 100644 --- a/rasgotransforms/rasgotransforms/transforms/plot/plot.sql +++ b/rasgotransforms/rasgotransforms/transforms/plot/plot.sql @@ -2,7 +2,6 @@ {% from 'expression_metrics.sql' import calculate_expression_metric_values %} {% from 'distinct_values.sql' import get_distinct_vals %} {% from 'pivot.sql' import pivot_plot_values %} -{% from 'filter.sql' import get_filter_statement, combine_filters %} {% from 'secondary_calculation.sql' import render_secondary_calculations, adjust_start_date %} {% set dimensions = dimensions if dimensions is defined else [] %} {% set max_num_groups = max_num_groups if max_num_groups is defined else 10 %} diff --git a/rasgotransforms/rasgotransforms/version.py b/rasgotransforms/rasgotransforms/version.py index 9f4470e1..2d67729f 100644 --- a/rasgotransforms/rasgotransforms/version.py +++ b/rasgotransforms/rasgotransforms/version.py @@ -1,4 +1,4 @@ """ Package version for pypi """ -__version__ = "2.4.1" +__version__ = "2.5.0a1"