|
11 | 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 | 12 | # See the License for the specific language governing permissions and
|
13 | 13 | # limitations under the License.
|
| 14 | +from __future__ import annotations |
14 | 15 |
|
15 | 16 | """
|
16 | 17 | Utility functions for SQL construction.
|
17 | 18 | """
|
18 | 19 |
|
19 |
| -from typing import Iterable |
| 20 | +import datetime |
| 21 | +import math |
| 22 | +import textwrap |
| 23 | +from typing import Iterable, TYPE_CHECKING |
20 | 24 |
|
| 25 | +# Literals and identifiers matching this pattern can be unquoted |
| 26 | +unquoted = r"^[A-Za-z_][A-Za-z_0-9]*$" |
21 | 27 |
|
22 |
| -def quote(value: str): |
23 |
| - """Return quoted input string.""" |
24 | 28 |
|
25 |
| - # Let's use repr which also escapes any special characters |
26 |
| - # |
27 |
| - # >>> for val in [ |
28 |
| - # ... "123", |
29 |
| - # ... "str with no special chars", |
30 |
| - # ... "str with special chars.,'\"/\\" |
31 |
| - # ... ]: |
32 |
| - # ... print(f"{val} -> {repr(val)}") |
33 |
| - # ... |
34 |
| - # 123 -> '123' |
35 |
| - # str with no special chars -> 'str with no special chars' |
36 |
| - # str with special chars.,'"/\ -> 'str with special chars.,\'"/\\' |
| 29 | +if TYPE_CHECKING: |
| 30 | + import google.cloud.bigquery as bigquery |
37 | 31 |
|
38 |
| - return repr(value) |
| 32 | + import bigframes.core.ordering |
39 | 33 |
|
40 | 34 |
|
41 |
| -def column_reference(column_name: str): |
| 35 | +### Writing SQL Values (literals, column references, table references, etc.) |
def simple_literal(value: str | int | bool | float):
    """Render a Python scalar as the text of a SQL literal.

    Strings are passed through ``escape_special_characters`` and wrapped in
    single quotes. Bools and ints are rendered with ``str()``. Floats are
    rendered with ``str()`` too, except NaN and the two infinities, which
    are written as a cast from a string.

    Raises:
        ValueError: If ``value`` is not a str, bool, int or float.
    """
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals
    if isinstance(value, str):
        # Single quoting seems to work nicer with ibis than double quoting
        escaped = escape_special_characters(value)
        return "'" + escaped + "'"
    if isinstance(value, (bool, int)):
        return str(value)
    if not isinstance(value, float):
        raise ValueError(f"Cannot produce literal for {value}")

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#floating_point_literals
    # Non-finite floats are spelled as a cast from a string.
    if math.isnan(value):
        return 'CAST("nan" as FLOAT)'
    if math.isinf(value):
        sign = "+" if value > 0 else "-"
        return f'CAST("{sign}inf" as FLOAT)'
    return str(value)
| 55 | + |
| 56 | + |
def multi_literal(*values: str):
    """Return the given values as a parenthesized, comma-separated literal list.

    Each value is rendered with ``simple_literal``; with no values the
    result is ``"()"``.
    """
    rendered = ", ".join([simple_literal(item) for item in values])
    return f"({rendered})"
| 60 | + |
| 61 | + |
def identifier(id: str) -> str:
    """Return ``id`` as a backtick-quoted SQL identifier (e.g. a column reference).

    The name is passed through ``escape_special_characters`` before being
    wrapped in backticks.
    """
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers
    # Quote unconditionally: quoting only when required would mean checking
    # the name against every reserved SQL keyword.
    escaped = escape_special_characters(id)
    return "`" + escaped + "`"
| 67 | + |
| 68 | + |
# Translation table mapping each special character to its backslash escape
# sequence. Built once at import time instead of on every call.
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals
_ESCAPE_TRANSLATION = str.maketrans(
    {
        "\a": r"\a",
        "\b": r"\b",
        "\f": r"\f",
        "\n": r"\n",
        "\r": r"\r",
        "\t": r"\t",
        "\v": r"\v",
        "\\": r"\\",
        "?": r"\?",
        '"': r"\"",
        "'": r"\'",
        "`": r"\`",
    }
)


def escape_special_characters(value: str) -> str:
    """Escape all special characters in ``value``.

    Control characters (bell, backspace, form feed, newline, carriage
    return, tab, vertical tab), the backslash, the question mark and the
    three quote characters (``"``, ``'`` and the backtick) are each replaced
    by a two-character backslash escape sequence. All other characters are
    returned unchanged.

    Args:
        value: The raw text to escape.

    Returns:
        The escaped text, without surrounding quotes.
    """
    return value.translate(_ESCAPE_TRANSLATION)
| 89 | + |
| 90 | + |
def cast_as_string(column_name: str) -> str:
    """Return SQL text that casts the named column to ``STRING``.

    The column name is quoted with ``identifier`` before being placed in the
    ``CAST`` expression.
    """
    column_sql = identifier(column_name)
    return "CAST(" + column_sql + " AS STRING)"
45 | 95 |
|
46 | 96 |
|
47 |
| -def cast_as_string(column_name: str): |
48 |
| - """Return a string representing string casting of a column.""" |
def csv(values: Iterable[str]) -> str:
    """Join ``values`` into one string, separated by a comma and a space.

    The values are used as given; no quoting or escaping is applied.
    """
    separator = ", "
    return separator.join(values)
49 | 100 |
|
50 |
| - return f"CAST({column_reference(column_name)} AS STRING)" |
51 | 101 |
|
def table_reference(table_ref: bigquery.TableReference) -> str:
    """Return the fully qualified, backtick-quoted name of ``table_ref``.

    The project, dataset id and table id are each escaped with
    ``escape_special_characters``, wrapped in backticks and joined with dots.
    """
    components = (table_ref.project, table_ref.dataset_id, table_ref.table_id)
    return ".".join(
        f"`{escape_special_characters(component)}`" for component in components
    )
52 | 104 |
|
53 |
| -def csv(values: Iterable[str], quoted=False): |
54 |
| - """Return a string of comma separated values.""" |
55 | 105 |
|
56 |
| - if quoted: |
57 |
| - values = [quote(val) for val in values] |
def infix_op(opname: str, left_arg: str, right_arg: str):
    """Return ``left_arg`` and ``right_arg`` joined by the infix operator ``opname``.

    The three pieces are separated by single spaces. No parentheses are
    added around the operands or the result.
    """
    # Maybe should add parentheses??
    return "{} {} {}".format(left_arg, opname, right_arg)
58 | 109 |
|
59 |
| - return ", ".join(values) |
| 110 | + |
| 111 | +### Writing SELECT expressions |
def select_from(columns: Iterable[str], subquery: str, distinct: bool = False):
    """Return a ``SELECT`` of ``columns`` from the parenthesized ``subquery``.

    Each column name is quoted with ``identifier``. When ``distinct`` is
    true, ``SELECT DISTINCT`` is emitted instead of ``SELECT``. The result
    ends with a newline after the closing parenthesis.
    """
    quoted_columns = [identifier(column) for column in columns]
    keyword = "SELECT DISTINCT" if distinct else "SELECT"
    sql = f"{keyword} {', '.join(quoted_columns)}\nFROM (\n{subquery}\n)\n"
    return textwrap.dedent(sql)
| 119 | + |
| 120 | + |
def select_table(table_ref: bigquery.TableReference):
    """Return a ``SELECT *`` statement over the table named by ``table_ref``.

    The table name is rendered with ``table_reference``.
    """
    qualified_name = table_reference(table_ref)
    return textwrap.dedent("SELECT * FROM " + qualified_name)
| 123 | + |
| 124 | + |
def is_distinct_sql(columns: Iterable[str], table_sql: str) -> str:
    """Return a query counting all rows and distinct rows of ``columns``.

    The query selects ``columns`` from ``table_sql`` twice, once as-is and
    once with ``DISTINCT``, and returns a single row with the row counts of
    both selections as ``total_count`` and ``distinct_count``.

    Args:
        columns: Names of the columns to select. May be any iterable,
            including a one-shot iterator.
        table_sql: SQL text of the query to select the columns from.

    Returns:
        The SQL text of the counting query.
    """
    # ``columns`` feeds two SELECTs. Materialize it first so that a one-shot
    # iterator (e.g. a generator) is not already exhausted when the DISTINCT
    # select is built, which would leave it with an empty column list.
    column_names = tuple(columns)
    is_unique_sql = f"""WITH full_table AS (
        {select_from(column_names, table_sql)}
    ),
    distinct_table AS (
        {select_from(column_names, table_sql, distinct=True)}
    )

    SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`,
    (SELECT COUNT(*) FROM distinct_table) AS `distinct_count`
    """
    return is_unique_sql
| 137 | + |
| 138 | + |
def ordering_clause(
    ordering: Iterable[bigframes.core.ordering.OrderingExpression],
) -> str:
    """Return an ``ORDER BY`` clause for the given ordering expressions.

    Each non-constant expression contributes one term of the form
    ``<quoted id> ASC|DESC NULLS LAST|NULLS FIRST``; the terms are joined
    with ``", "``. Constant expressions are skipped.

    Args:
        ordering: The ordering expressions, in priority order.

    Returns:
        The ``ORDER BY ...`` clause, or an empty string when no expression
        contributes a term (a bare ``ORDER BY`` would not be valid SQL).
    """
    # Imported inside the function, as in the original code.
    import bigframes.core.expression

    parts = []
    for col_ref in ordering:
        ordering_expr = col_ref.scalar_expression
        # We don't know how to compile scalar expressions in isolation
        if ordering_expr.is_const:
            # Probably shouldn't have constants in ordering definition, but
            # best to ignore if somehow they end up here.
            continue
        # Only plain variable references can be rendered here; the assert
        # also narrows the type for static checkers.
        assert isinstance(
            ordering_expr, bigframes.core.expression.UnboundVariableExpression
        )
        asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC"
        null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST"
        # Quote through identifier() so special characters in the id are
        # escaped the same way as every other identifier in this module.
        parts.append(f"{identifier(ordering_expr.id)} {asc_desc} {null_clause}")

    if not parts:
        return ""
    return f"ORDER BY {', '.join(parts)}"
| 159 | + |
| 160 | + |
def snapshot_clause(time_travel_timestamp: datetime.datetime):
    """Return a ``FOR SYSTEM_TIME AS OF`` clause for the given timestamp.

    The timestamp is written as the ``repr()`` of its ISO-8601 string
    (i.e. a quoted string) inside a ``TIMESTAMP(...)`` call.
    """
    timestamp_literal = repr(time_travel_timestamp.isoformat())
    return "FOR SYSTEM_TIME AS OF TIMESTAMP(" + timestamp_literal + ")"
0 commit comments