diff --git a/CHANGELOG.md b/CHANGELOG.md index 77a6576ee0..c1691e1f64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,29 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.19.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.18.0...v0.19.0) (2024-01-09) + + +### Features + +* Add 'columns' as an alias for 'col_order' ([#298](https://github.com/googleapis/python-bigquery-dataframes/issues/298)) ([a01b271](https://github.com/googleapis/python-bigquery-dataframes/commit/a01b271e76d05459f531cd83c6e93a2d13bfa061)) +* Add Series dt.tz and dt.unit properties ([#303](https://github.com/googleapis/python-bigquery-dataframes/issues/303)) ([2e1a403](https://github.com/googleapis/python-bigquery-dataframes/commit/2e1a4036e58fb6b35aa68ac6d121cb0d04f4f369)) +* Add to_gbq() method for LLM models ([#299](https://github.com/googleapis/python-bigquery-dataframes/issues/299)) ([dafbc1b](https://github.com/googleapis/python-bigquery-dataframes/commit/dafbc1bdb225c7132cdf7191792fde785947c7a1)) +* Allow manually set clustering_columns in dataframe.to_gbq ([#302](https://github.com/googleapis/python-bigquery-dataframes/issues/302)) ([9c21323](https://github.com/googleapis/python-bigquery-dataframes/commit/9c213239a73b5cd0ca7b647a86238263d3947431)) +* Support assigning to columns like a property ([#304](https://github.com/googleapis/python-bigquery-dataframes/issues/304)) ([f645c56](https://github.com/googleapis/python-bigquery-dataframes/commit/f645c56e5436adb100018afbf9ef18003a1a6ed9)) +* Support upcasting numeric columns in concat ([#294](https://github.com/googleapis/python-bigquery-dataframes/issues/294)) ([e3a056a](https://github.com/googleapis/python-bigquery-dataframes/commit/e3a056a301e99c4c3d2a2ecdcbcaf8804be8089f)) + + +### Bug Fixes + +* DF.drop tuple input as multi-index ([#301](https://github.com/googleapis/python-bigquery-dataframes/issues/301)) ([21391a9](https://github.com/googleapis/python-bigquery-dataframes/commit/21391a9d07bb0dc6b6f900f1b069350d6232bd92)) +* Fix bug converting non-string labels to sql ids ([#296](https://github.com/googleapis/python-bigquery-dataframes/issues/296)) ([a61c5fe](https://github.com/googleapis/python-bigquery-dataframes/commit/a61c5fef1e3b88f38269ee5bfd50886b8d2908ae)) + + +### Documentation + +* Add code samples for `Series.ffill` and `DataFrame.ffill` ([#307](https://github.com/googleapis/python-bigquery-dataframes/issues/307)) ([1c63b45](https://github.com/googleapis/python-bigquery-dataframes/commit/1c63b451bb057e5b6470d63d4b44c090d7172aa5)) + ## [0.18.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.17.0...v0.18.0) (2024-01-02) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index e8ac8c1d0f..7ff23efad3 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -27,6 +27,7 @@ import bigframes.core.nodes as nodes from 
bigframes.core.ordering import OrderingColumnReference import bigframes.core.ordering as orderings +import bigframes.core.utils from bigframes.core.window_spec import WindowSpec import bigframes.dtypes import bigframes.operations as ops @@ -69,10 +70,14 @@ def from_ibis( @classmethod def from_pandas(cls, pd_df: pandas.DataFrame): iobytes = io.BytesIO() - # Discard row labels and use simple string ids for columns - column_ids = tuple(str(label) for label in pd_df.columns) - pd_df.reset_index(drop=True).set_axis(column_ids, axis=1).to_feather(iobytes) - node = nodes.ReadLocalNode(iobytes.getvalue(), column_ids=column_ids) + # Use alphanumeric identifiers, to avoid downstream problems with escaping. + as_ids = [ + bigframes.core.utils.label_to_identifier(label, strict=True) + for label in pd_df.columns + ] + unique_ids = tuple(bigframes.core.utils.disambiguate_ids(as_ids)) + pd_df.reset_index(drop=True).set_axis(unique_ids, axis=1).to_feather(iobytes) + node = nodes.ReadLocalNode(iobytes.getvalue()) return cls(node) @property @@ -152,8 +157,8 @@ def project_unary_op( ) -> ArrayValue: """Creates a new expression based on this expression with unary operation applied to one column.""" return ArrayValue( - nodes.ProjectUnaryOpNode( - child=self.node, input_id=column_name, op=op, output_id=output_name + nodes.ProjectRowOpNode( + child=self.node, input_ids=(column_name,), op=op, output_id=output_name ) ) @@ -166,10 +171,9 @@ def project_binary_op( ) -> ArrayValue: """Creates a new expression based on this expression with binary operation applied to two columns.""" return ArrayValue( - nodes.ProjectBinaryOpNode( + nodes.ProjectRowOpNode( child=self.node, - left_input_id=left_column_id, - right_input_id=right_column_id, + input_ids=(left_column_id, right_column_id), op=op, output_id=output_column_id, ) @@ -185,11 +189,9 @@ def project_ternary_op( ) -> ArrayValue: """Creates a new expression based on this expression with ternary operation applied to three columns.""" return ArrayValue( - nodes.ProjectTernaryOpNode( + nodes.ProjectRowOpNode( child=self.node, - input_id1=col_id_1, - input_id2=col_id_2, - input_id3=col_id_3, + input_ids=(col_id_1, col_id_2, col_id_3), op=op, output_id=output_column_id, ) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index c6867c1a33..0b6886562e 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -45,7 +45,7 @@ def equals(block1: blocks.Block, block2: blocks.Block) -> bool: lcolmapped = lmap[lcol] rcolmapped = rmap[rcol] joined_block, result_id = joined_block.apply_binary_op( - lcolmapped, rcolmapped, ops.eq_nulls_match_op + lcolmapped, rcolmapped, ops.eq_null_match_op ) joined_block, result_id = joined_block.apply_unary_op( result_id, ops.partial_right(ops.fillna_op, False) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 779d11b371..9688f439b1 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -581,12 +581,12 @@ def _split( # Create an ordering col and convert to string block, ordering_col = block.promote_offsets() block, string_ordering_col = block.apply_unary_op( - ordering_col, ops.AsTypeOp("string[pyarrow]") + ordering_col, ops.AsTypeOp(to_type="string[pyarrow]") ) # Apply hash method to sum col and order by it. 
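+        # Hashing the (stringified offset ++ random seed) value yields a deterministic
+        # pseudorandom ordering, so a given random_state always reproduces the same split.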
block, string_sum_col = block.apply_binary_op( - string_ordering_col, random_state_col, ops.concat_op + string_ordering_col, random_state_col, ops.strconcat_op ) block, hash_string_sum_col = block.apply_unary_op(string_sum_col, ops.hash_op) block = block.order_by([ordering.OrderingColumnReference(hash_string_sum_col)]) @@ -1232,8 +1232,8 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - expr = expr.project_unary_op(index_col, ops.AsTypeOp("string")) - prefix_op = ops.BinopPartialLeft(ops.add_op, prefix) + expr = expr.project_unary_op(index_col, ops.AsTypeOp(to_type="string")) + prefix_op = ops.ApplyLeft(base_op=ops.add_op, left_scalar=prefix) expr = expr.project_unary_op(index_col, prefix_op) return Block( expr, @@ -1251,8 +1251,8 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - expr = expr.project_unary_op(index_col, ops.AsTypeOp("string")) - prefix_op = ops.BinopPartialRight(ops.add_op, suffix) + expr = expr.project_unary_op(index_col, ops.AsTypeOp(to_type="string")) + prefix_op = ops.ApplyRight(base_op=ops.add_op, right_scalar=suffix) expr = expr.project_unary_op(index_col, prefix_op) return Block( expr, @@ -1506,8 +1506,10 @@ def concat( blocks: typing.List[Block] = [self, *other] if ignore_index: blocks = [block.reset_index() for block in blocks] - - result_labels = _align_indices(blocks) + level_names = None + else: + level_names, level_types = _align_indices(blocks) + blocks = [_cast_index(block, level_types) for block in blocks] index_nlevels = blocks[0].index.nlevels @@ -1522,7 +1524,7 @@ def concat( result_expr, index_columns=list(result_expr.column_ids)[:index_nlevels], column_labels=aligned_blocks[0].column_labels, - index_labels=result_labels, + index_labels=level_names, ) if ignore_index: result_block = result_block.reset_index() @@ -1783,16 +1785,40 @@ def block_from_local(data) -> Block: ) +def _cast_index(block: Block, dtypes: typing.Sequence[bigframes.dtypes.Dtype]): + original_block = block + result_ids = [] + for idx_id, idx_dtype, target_dtype in zip( + block.index_columns, block.index_dtypes, dtypes + ): + if idx_dtype != target_dtype: + block, result_id = block.apply_unary_op(idx_id, ops.AsTypeOp(target_dtype)) + result_ids.append(result_id) + else: + result_ids.append(idx_id) + + expr = block.expr.select_columns((*result_ids, *original_block.value_columns)) + return Block( + expr, + index_columns=result_ids, + column_labels=original_block.column_labels, + index_labels=original_block.index_labels, + ) + + def _align_block_to_schema( block: Block, schema: dict[Label, bigframes.dtypes.Dtype] ) -> Block: - """For a given schema, remap block to schema by reordering columns and inserting nulls.""" + """For a given schema, remap block to schema by reordering columns, and inserting nulls.""" col_ids: typing.Tuple[str, ...] 
= () for label, dtype in schema.items(): - # TODO: Support casting to lcd type - requires mixed type support matching_ids: typing.Sequence[str] = block.label_to_col_id.get(label, ()) if len(matching_ids) > 0: col_id = matching_ids[-1] + col_dtype = block.expr.get_column_type(col_id) + if dtype != col_dtype: + # If _align_schema worked properly, this should always be an upcast + block, col_id = block.apply_unary_op(col_id, ops.AsTypeOp(dtype)) col_ids = (*col_ids, col_id) else: block, null_column = block.create_constant(None, dtype=dtype) @@ -1810,24 +1836,28 @@ def _align_schema( return functools.reduce(reduction, schemas) -def _align_indices(blocks: typing.Sequence[Block]) -> typing.Sequence[Label]: - """Validates that the blocks have compatible indices and returns the resulting label names.""" +def _align_indices( + blocks: typing.Sequence[Block], +) -> typing.Tuple[typing.Sequence[Label], typing.Sequence[bigframes.dtypes.Dtype]]: + """Validates that the blocks have compatible indices and returns the resulting label names and dtypes.""" names = blocks[0].index.names types = blocks[0].index.dtypes + for block in blocks[1:]: if len(names) != block.index.nlevels: raise NotImplementedError( f"Cannot combine indices with different number of levels. Use 'ignore_index'=True. {constants.FEEDBACK_LINK}" ) - if block.index.dtypes != types: - raise NotImplementedError( - f"Cannot combine different index dtypes. Use 'ignore_index'=True. {constants.FEEDBACK_LINK}" - ) names = [ lname if lname == rname else None for lname, rname in zip(names, block.index.names) ] - return names + types = [ + bigframes.dtypes.lcd_type_or_throw(ltype, rtype) + for ltype, rtype in zip(types, block.index.dtypes) + ] + types = typing.cast(typing.Sequence[bigframes.dtypes.Dtype], types) + return names, types def _combine_schema_inner( @@ -1835,13 +1865,15 @@ def _combine_schema_inner( right: typing.Dict[Label, bigframes.dtypes.Dtype], ) -> typing.Dict[Label, bigframes.dtypes.Dtype]: result = dict() - for label, type in left.items(): + for label, left_type in left.items(): if label in right: - if type != right[label]: + right_type = right[label] + output_type = bigframes.dtypes.lcd_type(left_type, right_type) + if output_type is None: raise ValueError( f"Cannot concat rows with label {label} due to mismatched types. {constants.FEEDBACK_LINK}" ) - result[label] = type + result[label] = output_type return result @@ -1850,15 +1882,20 @@ def _combine_schema_outer( right: typing.Dict[Label, bigframes.dtypes.Dtype], ) -> typing.Dict[Label, bigframes.dtypes.Dtype]: result = dict() - for label, type in left.items(): - if (label in right) and (type != right[label]): - raise ValueError( - f"Cannot concat rows with label {label} due to mismatched types. {constants.FEEDBACK_LINK}" - ) - result[label] = type - for label, type in right.items(): + for label, left_type in left.items(): + if label not in right: + result[label] = left_type + else: + right_type = right[label] + output_type = bigframes.dtypes.lcd_type(left_type, right_type) + if output_type is None: + raise NotImplementedError( + f"Cannot concat rows with label {label} due to mismatched types. 
{constants.FEEDBACK_LINK}" + ) + result[label] = output_type + for label, right_type in right.items(): if label not in left: - result[label] = type + result[label] = right_type return result diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 524699290b..c1e8f1ea48 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -27,6 +27,7 @@ import pandas import bigframes.constants as constants +import bigframes.core.compile.scalar_op_compiler as op_compilers import bigframes.core.guid from bigframes.core.ordering import ( encode_order_string, @@ -43,8 +44,11 @@ ORDER_ID_COLUMN = "bigframes_ordering_id" PREDICATE_COLUMN = "bigframes_predicate" + T = typing.TypeVar("T", bound="BaseIbisIR") +op_compiler = op_compilers.scalar_op_compiler + class BaseIbisIR(abc.ABC): """Implementation detail, contains common logic between ordered and unordered IR""" @@ -147,49 +151,20 @@ def _reproject_to_table(self: T) -> T: """ ... - def project_unary_op( + def project_row_op( self: T, - input_column_id: str, - op: ops.UnaryOp, + input_column_ids: typing.Sequence[str], + op: ops.RowOp, output_column_id: typing.Optional[str] = None, ) -> T: """Creates a new expression based on this expression with unary operation applied to one column.""" result_id = ( - output_column_id or input_column_id + output_column_id or input_column_ids[0] ) # overwrite input if not output id provided - value = op._as_ibis(self._get_ibis_column(input_column_id)).name(result_id) + inputs = tuple(self._get_ibis_column(col) for col in input_column_ids) + value = op_compiler.compile_row_op(op, inputs).name(result_id) return self._set_or_replace_by_id(result_id, value) - def project_binary_op( - self: T, - left_column_id: str, - right_column_id: str, - op: ops.BinaryOp, - output_column_id: str, - ) -> T: - """Creates a new expression based on this expression with binary operation applied to two columns.""" - value = op( - self._get_ibis_column(left_column_id), - self._get_ibis_column(right_column_id), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) - - def project_ternary_op( - self: T, - col_id_1: str, - col_id_2: str, - col_id_3: str, - op: ops.TernaryOp, - output_column_id: str, - ) -> T: - """Creates a new expression based on this expression with ternary operation applied to three columns.""" - value = op( - self._get_ibis_column(col_id_1), - self._get_ibis_column(col_id_2), - self._get_ibis_column(col_id_3), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) - def assign(self: T, source_id: str, destination_id: str) -> T: return self._set_or_replace_by_id( destination_id, self._get_ibis_column(source_id) @@ -454,7 +429,9 @@ def unpivot( None, force_dtype=col_dtype ) ibis_values = [ - ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) + op_compiler.compile_row_op( + ops.AsTypeOp(col_dtype), (unpivot_table[col],) + ) if col is not None else null_value for col in source_cols @@ -521,9 +498,7 @@ def aggregate( expr = OrderedIR(result, columns=columns, ordering=ordering) if dropna: for column_id in by_column_ids: - expr = expr._filter( - ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) - ) + expr = expr._filter(expr._get_ibis_column(column_id).notnull()) # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation return expr._project_offsets() else: @@ -982,7 +957,9 @@ def unpivot( None, force_dtype=col_dtype ) ibis_values = [ - 
ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col])
+                op_compiler.compile_row_op(
+                    ops.AsTypeOp(col_dtype), (unpivot_table[col],)
+                )
                 if col is not None
                 else null_value
                 for col in source_cols
diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py
index 17dcde638f..c28958a861 100644
--- a/bigframes/core/compile/compiler.py
+++ b/bigframes/core/compile/compiler.py
@@ -143,23 +143,9 @@ def compile_reversed(node: nodes.ReversedNode, ordered: bool = True):
 @_compile_node.register
-def compile_project_unary(node: nodes.ProjectUnaryOpNode, ordered: bool = True):
-    return compile_node(node.child, ordered).project_unary_op(
-        node.input_id, node.op, node.output_id
-    )
-
-
-@_compile_node.register
-def compile_project_binary(node: nodes.ProjectBinaryOpNode, ordered: bool = True):
-    return compile_node(node.child, ordered).project_binary_op(
-        node.left_input_id, node.right_input_id, node.op, node.output_id
-    )
-
-
-@_compile_node.register
-def compile_project_ternary(node: nodes.ProjectTernaryOpNode, ordered: bool = True):
-    return compile_node(node.child, ordered).project_ternary_op(
-        node.input_id1, node.input_id2, node.input_id3, node.op, node.output_id
+def compile_project(node: nodes.ProjectRowOpNode, ordered: bool = True):
+    return compile_node(node.child, ordered).project_row_op(
+        node.input_ids, node.op, node.output_id
     )
diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
new file mode 100644
index 0000000000..d711dbf456
--- /dev/null
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -0,0 +1,1134 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import functools
+import typing
+
+import ibis
+import ibis.common.exceptions
+import ibis.expr.datatypes as ibis_dtypes
+import ibis.expr.operations.generic
+import ibis.expr.types as ibis_types
+import numpy as np
+import pandas as pd
+
+import bigframes.constants as constants
+import bigframes.dtypes
+import bigframes.dtypes as dtypes
+import bigframes.operations as ops
+
+_ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0))
+_NAN = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.nan))
+_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.inf))
+_NEG_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(-np.inf))
+
+# Approximately the highest number you can pass in to the EXP function and get a valid FLOAT64 result
+# FLOAT64 has 11 exponent bits, so the max value is about 2**(2**10)
+# ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow.
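+# A quick sanity check of that bound (in CPython): math.exp(709.78) ~= 1.7929e308,
+# just below sys.float_info.max (~1.7977e308), while math.exp(709.79) overflows.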
+_FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78))
+
+
+class ScalarOpCompiler:
+    # Mapping of operation name to implementations
+    _registry: dict[
+        str,
+        typing.Callable[
+            [typing.Sequence[ibis_types.Value], ops.RowOp], ibis_types.Value
+        ],
+    ] = {}
+
+    def compile_row_op(
+        self, op: ops.RowOp, inputs: typing.Sequence[ibis_types.Value]
+    ) -> ibis_types.Value:
+        impl = self._registry[op.name]
+        return impl(inputs, op)
+
+    def register_unary_op(
+        self,
+        op_ref: typing.Union[ops.UnaryOp, type[ops.UnaryOp]],
+        pass_op: bool = False,
+    ):
+        """
+        Decorator to register a unary op implementation.
+
+        Args:
+            op_ref (UnaryOp or UnaryOp type):
+                Class or instance of operator that is implemented by the decorated function.
+            pass_op (bool):
+                Set to True if the implementation takes the operator object as the last argument.
+                This is needed for parameterized ops where parameters are part of the op object.
+        """
+        key = typing.cast(str, op_ref.name)
+
+        def decorator(impl: typing.Callable[..., ibis_types.Value]):
+            def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp):
+                if pass_op:
+                    return impl(args[0], op)
+                else:
+                    return impl(args[0])
+
+            self._register(key, normalized_impl)
+            return impl
+
+        return decorator
+
+    def register_binary_op(
+        self,
+        op_ref: typing.Union[ops.BinaryOp, type[ops.BinaryOp]],
+        pass_op: bool = False,
+    ):
+        """
+        Decorator to register a binary op implementation.
+
+        Args:
+            op_ref (BinaryOp or BinaryOp type):
+                Class or instance of operator that is implemented by the decorated function.
+            pass_op (bool):
+                Set to True if the implementation takes the operator object as the last argument.
+                This is needed for parameterized ops where parameters are part of the op object.
+        """
+        key = typing.cast(str, op_ref.name)
+
+        def decorator(impl: typing.Callable[..., ibis_types.Value]):
+            def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp):
+                if pass_op:
+                    return impl(args[0], args[1], op)
+                else:
+                    return impl(args[0], args[1])
+
+            self._register(key, normalized_impl)
+            return impl
+
+        return decorator
+
+    def register_ternary_op(
+        self, op_ref: typing.Union[ops.TernaryOp, type[ops.TernaryOp]]
+    ):
+        """
+        Decorator to register a ternary op implementation.
+
+        Args:
+            op_ref (TernaryOp or TernaryOp type):
+                Class or instance of operator that is implemented by the decorated function.
+ """ + key = typing.cast(str, op_ref.name) + + def decorator(impl: typing.Callable[..., ibis_types.Value]): + def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp): + return impl(args[0], args[1], args[2]) + + self._register(key, normalized_impl) + return impl + + return decorator + + def _register( + self, + op_name: str, + impl: typing.Callable[ + [typing.Sequence[ibis_types.Value], ops.RowOp], ibis_types.Value + ], + ): + if op_name in self._registry: + raise ValueError(f"Operation name {op_name} already registered") + self._registry[op_name] = impl + + +# Singleton compiler +scalar_op_compiler = ScalarOpCompiler() + + +### Unary Ops +@scalar_op_compiler.register_unary_op(ops.isnull_op) +def isnull_op_impl(x: ibis_types.Value): + return x.isnull() + + +@scalar_op_compiler.register_unary_op(ops.notnull_op) +def notnull_op_impl(x: ibis_types.Value): + return x.notnull() + + +@scalar_op_compiler.register_unary_op(ops.hash_op) +def hash_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.IntegerValue, x).hash() + + +# Trig Functions +@scalar_op_compiler.register_unary_op(ops.sin_op) +def sin_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).sin() + + +@scalar_op_compiler.register_unary_op(ops.cos_op) +def cos_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).cos() + + +@scalar_op_compiler.register_unary_op(ops.tan_op) +def tan_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).tan() + + +# Inverse trig functions +@scalar_op_compiler.register_unary_op(ops.arcsin_op) +def arcsin_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() <= _ibis_num(1) + return (~domain).ifelse(_NAN, numeric_value.asin()) + + +@scalar_op_compiler.register_unary_op(ops.arccos_op) +def arccos_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() <= _ibis_num(1) + return (~domain).ifelse(_NAN, numeric_value.acos()) + + +@scalar_op_compiler.register_unary_op(ops.arctan_op) +def arctan_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).atan() + + +# Hyperbolic trig functions +# BQ has these functions, but Ibis doesn't +@scalar_op_compiler.register_unary_op(ops.sinh_op) +def sinh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sinh_result = (numeric_value.exp() - (numeric_value.negate()).exp()) / _ibis_num(2) + domain = numeric_value.abs() < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF * numeric_value.sign(), sinh_result) + + +@scalar_op_compiler.register_unary_op(ops.cosh_op) +def cosh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + cosh_result = (numeric_value.exp() + (numeric_value.negate()).exp()) / _ibis_num(2) + domain = numeric_value.abs() < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF, cosh_result) + + +@scalar_op_compiler.register_unary_op(ops.tanh_op) +def tanh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + tanh_result = (numeric_value.exp() - (numeric_value.negate()).exp()) / ( + numeric_value.exp() + (numeric_value.negate()).exp() + ) + # Beyond +-20, is effectively just the sign function + domain = numeric_value.abs() < _ibis_num(20) + return (~domain).ifelse(numeric_value.sign(), tanh_result) + + +@scalar_op_compiler.register_unary_op(ops.arcsinh_op) +def arcsinh_op_impl(x: ibis_types.Value): + 
numeric_value = typing.cast(ibis_types.NumericValue, x) + sqrt_part = ((numeric_value * numeric_value) + _ibis_num(1)).sqrt() + return (numeric_value.abs() + sqrt_part).ln() * numeric_value.sign() + + +@scalar_op_compiler.register_unary_op(ops.arccosh_op) +def arccosh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sqrt_part = ((numeric_value * numeric_value) - _ibis_num(1)).sqrt() + acosh_result = (numeric_value + sqrt_part).ln() + domain = numeric_value >= _ibis_num(1) + return (~domain).ifelse(_NAN, acosh_result) + + +@scalar_op_compiler.register_unary_op(ops.arctanh_op) +def arctanh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() < _ibis_num(1) + numerator = numeric_value + _ibis_num(1) + denominator = _ibis_num(1) - numeric_value + ln_input = typing.cast(ibis_types.NumericValue, numerator.div(denominator)) + atanh_result = ln_input.ln().div(2) + + out_of_domain = (numeric_value.abs() == _ibis_num(1)).ifelse( + _INF * numeric_value, _NAN + ) + + return (~domain).ifelse(out_of_domain, atanh_result) + + +# Numeric Ops +@scalar_op_compiler.register_unary_op(ops.abs_op) +def abs_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).abs() + + +@scalar_op_compiler.register_unary_op(ops.sqrt_op) +def sqrt_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value >= _ZERO + return (~domain).ifelse(_NAN, numeric_value.sqrt()) + + +@scalar_op_compiler.register_unary_op(ops.log10_op) +def log10_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value > _ZERO + out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) + return (~domain).ifelse(out_of_domain, numeric_value.log10()) + + +@scalar_op_compiler.register_unary_op(ops.ln_op) +def ln_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value > _ZERO + out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) + return (~domain).ifelse(out_of_domain, numeric_value.ln()) + + +@scalar_op_compiler.register_unary_op(ops.exp_op) +def exp_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF, numeric_value.exp()) + + +@scalar_op_compiler.register_unary_op(ops.invert_op) +def invert_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).negate() + + +## String Operation +@scalar_op_compiler.register_unary_op(ops.len_op) +def len_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).length().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.reverse_op) +def reverse_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).reverse() + + +@scalar_op_compiler.register_unary_op(ops.lower_op) +def lower_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).lower() + + +@scalar_op_compiler.register_unary_op(ops.upper_op) +def upper_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).upper() + + +@scalar_op_compiler.register_unary_op(ops.strip_op) +def strip_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).strip() + + +@scalar_op_compiler.register_unary_op(ops.isnumeric_op) +def isnumeric_op_impl(x: ibis_types.Value): + # catches all members of the Unicode number class, 
which matches pandas isnumeric
+    # see https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#regexp_contains
+    # TODO: Validate correctness; may miss e.g. the ⅕ character
+    return typing.cast(ibis_types.StringValue, x).re_search(r"^(\pN+)$")
+
+
+@scalar_op_compiler.register_unary_op(ops.isalpha_op)
+def isalpha_op_impl(x: ibis_types.Value):
+    return typing.cast(ibis_types.StringValue, x).re_search(
+        r"^(\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$"
+    )
+
+
+@scalar_op_compiler.register_unary_op(ops.isdigit_op)
+def isdigit_op_impl(x: ibis_types.Value):
+    # Based on docs, should include superscript/subscript-ed numbers
+    # Tests, however, pass only when set to the Nd unicode class
+    return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$")
+
+
+@scalar_op_compiler.register_unary_op(ops.isdecimal_op)
+def isdecimal_op_impl(x: ibis_types.Value):
+    return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$")
+
+
+@scalar_op_compiler.register_unary_op(ops.isalnum_op)
+def isalnum_op_impl(x: ibis_types.Value):
+    return typing.cast(ibis_types.StringValue, x).re_search(
+        r"^(\p{N}|\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$"
+    )
+
+
+@scalar_op_compiler.register_unary_op(ops.isspace_op)
+def isspace_op_impl(x: ibis_types.Value):
+    # All characters are whitespace characters, False for empty string
+    return typing.cast(ibis_types.StringValue, x).re_search(r"^\s+$")
+
+
+@scalar_op_compiler.register_unary_op(ops.islower_op)
+def islower_op_impl(x: ibis_types.Value):
+    # No upper case characters, min one cased character
+    # See: https://docs.python.org/3/library/stdtypes.html#str
+    return typing.cast(ibis_types.StringValue, x).re_search(r"\p{Ll}") & ~typing.cast(
+        ibis_types.StringValue, x
+    ).re_search(r"\p{Lu}|\p{Lt}")
+
+
+@scalar_op_compiler.register_unary_op(ops.isupper_op)
+def isupper_op_impl(x: ibis_types.Value):
+    # No lower case characters, min one cased character
+    # See: https://docs.python.org/3/library/stdtypes.html#str
+    return typing.cast(ibis_types.StringValue, x).re_search(r"\p{Lu}") & ~typing.cast(
+        ibis_types.StringValue, x
+    ).re_search(r"\p{Ll}|\p{Lt}")
+
+
+@scalar_op_compiler.register_unary_op(ops.rstrip_op)
+def rstrip_op_impl(x: ibis_types.Value):
+    return typing.cast(ibis_types.StringValue, x).rstrip()
+
+
+@scalar_op_compiler.register_unary_op(ops.lstrip_op)
+def lstrip_op_impl(x: ibis_types.Value):
+    return typing.cast(ibis_types.StringValue, x).lstrip()
+
+
+@scalar_op_compiler.register_unary_op(ops.capitalize_op)
+def capitalize_op_impl(x: ibis_types.Value):
+    return typing.cast(ibis_types.StringValue, x).capitalize()
+
+
+@scalar_op_compiler.register_unary_op(ops.StrContainsOp, pass_op=True)
+def strcontains_op(x: ibis_types.Value, op: ops.StrContainsOp):
+    return typing.cast(ibis_types.StringValue, x).contains(op.pat)
+
+
+@scalar_op_compiler.register_unary_op(ops.StrContainsRegexOp, pass_op=True)
+def contains_regex_op_impl(x: ibis_types.Value, op: ops.StrContainsRegexOp):
+    return typing.cast(ibis_types.StringValue, x).re_search(op.pat)
+
+
+@scalar_op_compiler.register_unary_op(ops.StrGetOp, pass_op=True)
+def strget_op_impl(x: ibis_types.Value, op: ops.StrGetOp):
+    substr = typing.cast(
+        ibis_types.StringValue, typing.cast(ibis_types.StringValue, x)[op.i]
+    )
+    return substr.nullif(ibis_types.literal(""))
+
+
+@scalar_op_compiler.register_unary_op(ops.StrPadOp, pass_op=True)
+def strpad_op_impl(x: ibis_types.Value, op: ops.StrPadOp):
+    str_val 
= typing.cast(ibis_types.StringValue, x) + + # SQL pad operations will truncate, we do not want to truncate though. + pad_length = ibis.greatest(str_val.length(), op.length) + if op.side == "left": + return str_val.lpad(pad_length, op.fillchar) + elif op.side == "right": + return str_val.rpad(pad_length, op.fillchar) + else: # side == both + # Pad more on right side if can't pad both sides equally + lpad_amount = ((pad_length - str_val.length()) // 2) + str_val.length() + return str_val.lpad(lpad_amount, op.fillchar).rpad(pad_length, op.fillchar) + + +@scalar_op_compiler.register_unary_op(ops.ReplaceStrOp, pass_op=True) +def replacestring_op_impl(x: ibis_types.Value, op: ops.ReplaceStrOp): + pat_str_value = typing.cast(ibis_types.StringValue, ibis_types.literal(op.pat)) + repl_str_value = typing.cast(ibis_types.StringValue, ibis_types.literal(op.repl)) + return typing.cast(ibis_types.StringValue, x).replace(pat_str_value, repl_str_value) + + +@scalar_op_compiler.register_unary_op(ops.RegexReplaceStrOp, pass_op=True) +def replaceregex_op_impl(x: ibis_types.Value, op: ops.RegexReplaceStrOp): + return typing.cast(ibis_types.StringValue, x).re_replace(op.pat, op.repl) + + +@scalar_op_compiler.register_unary_op(ops.StartsWithOp, pass_op=True) +def startswith_op_impl(x: ibis_types.Value, op: ops.StartsWithOp): + any_match = None + for pat in op.pat: + pat_match = typing.cast(ibis_types.StringValue, x).startswith(pat) + if any_match is not None: + any_match = any_match | pat_match + else: + any_match = pat_match + return any_match if any_match is not None else ibis_types.literal(False) + + +@scalar_op_compiler.register_unary_op(ops.EndsWithOp, pass_op=True) +def endswith_op_impl(x: ibis_types.Value, op: ops.EndsWithOp): + any_match = None + for pat in op.pat: + pat_match = typing.cast(ibis_types.StringValue, x).endswith(pat) + if any_match is not None: + any_match = any_match | pat_match + else: + any_match = pat_match + return any_match if any_match is not None else ibis_types.literal(False) + + +@scalar_op_compiler.register_unary_op(ops.ZfillOp, pass_op=True) +def zfill_op_impl(x: ibis_types.Value, op: ops.ZfillOp): + str_value = typing.cast(ibis_types.StringValue, x) + return ( + ibis.case() + .when( + str_value[0] == "-", + "-" + + strpad_op_impl( + str_value.substr(1), + ops.StrPadOp(length=op.width - 1, fillchar="0", side="left"), + ), + ) + .else_( + strpad_op_impl( + str_value, ops.StrPadOp(length=op.width, fillchar="0", side="left") + ) + ) + .end() + ) + + +@scalar_op_compiler.register_unary_op(ops.StrFindOp, pass_op=True) +def find_op_impl(x: ibis_types.Value, op: ops.StrFindOp): + return typing.cast(ibis_types.StringValue, x).find(op.substr, op.start, op.end) + + +@scalar_op_compiler.register_unary_op(ops.StrExtractOp, pass_op=True) +def extract_op_impl(x: ibis_types.Value, op: ops.StrExtractOp): + return typing.cast(ibis_types.StringValue, x).re_extract(op.pat, op.n) + + +@scalar_op_compiler.register_unary_op(ops.StrSliceOp, pass_op=True) +def slice_op_impl(x: ibis_types.Value, op: ops.StrSliceOp): + return typing.cast(ibis_types.StringValue, x)[op.start : op.end] + + +@scalar_op_compiler.register_unary_op(ops.StrRepeatOp, pass_op=True) +def repeat_op_impl(x: ibis_types.Value, op: ops.StrRepeatOp): + return typing.cast(ibis_types.StringValue, x).repeat(op.repeats) + + +## Datetime Ops +@scalar_op_compiler.register_unary_op(ops.day_op) +def day_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).day().cast(ibis_dtypes.int64) + + 
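+# Note: the integer-valued datetime parts in this section (day, hour, month, ...)
+# are cast to int64 so results surface as the nullable Int64 dtype, matching pandas.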
+@scalar_op_compiler.register_unary_op(ops.date_op) +def date_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).date() + + +@scalar_op_compiler.register_unary_op(ops.dayofweek_op) +def dayofweek_op_impl(x: ibis_types.Value): + return ( + typing.cast(ibis_types.TimestampValue, x) + .day_of_week.index() + .cast(ibis_dtypes.int64) + ) + + +@scalar_op_compiler.register_unary_op(ops.hour_op) +def hour_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).hour().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.minute_op) +def minute_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).minute().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.month_op) +def month_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).month().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.quarter_op) +def quarter_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).quarter().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.second_op) +def second_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).second().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.time_op) +def time_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).time() + + +@scalar_op_compiler.register_unary_op(ops.year_op) +def year_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).year().cast(ibis_dtypes.int64) + + +# Parameterized ops +@scalar_op_compiler.register_unary_op(ops.StructFieldOp, pass_op=True) +def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): + struct_value = typing.cast(ibis_types.StructValue, x) + if isinstance(op.name_or_index, str): + name = op.name_or_index + else: + name = struct_value.names[op.name_or_index] + return struct_value[name].name(name) + + +@scalar_op_compiler.register_unary_op(ops.AsTypeOp, pass_op=True) +def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): + to_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype(op.to_type) + if isinstance(x, ibis_types.NullScalar): + return ibis_types.null().cast(to_type) + return bigframes.dtypes.cast_ibis_value(x, to_type) + + +@scalar_op_compiler.register_unary_op(ops.IsInOp, pass_op=True) +def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): + contains_nulls = any(is_null(value) for value in op.values) + matchable_ibis_values = [] + for item in op.values: + if not is_null(item): + try: + # we want values that *could* be cast to the dtype, but we don't want + # to actually cast it, as that could be lossy (eg float -> int) + item_inferred_type = ibis.literal(item).type() + if ( + x.type() == item_inferred_type + or x.type().is_numeric() + and item_inferred_type.is_numeric() + ): + matchable_ibis_values.append(item) + except TypeError: + pass + + if op.match_nulls and contains_nulls: + return x.isnull() | x.isin(matchable_ibis_values) + else: + return x.isin(matchable_ibis_values) + + +@scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) +def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): + if not hasattr(op.func, "bigframes_remote_function"): + raise TypeError( + f"only a bigframes remote function is supported as a callable. 
{constants.FEEDBACK_LINK}" + ) + x_transformed = op.func(x) + if not op.apply_on_null: + x_transformed = ibis.case().when(x.isnull(), x).else_(x_transformed).end() + return x_transformed + + +@scalar_op_compiler.register_unary_op(ops.MapOp, pass_op=True) +def map_op_impl(x: ibis_types.Value, op: ops.MapOp): + case = ibis.case() + for mapping in op.mappings: + case = case.when(x == mapping[0], mapping[1]) + return case.else_(x).end() + + +### Binary Ops +def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None): + """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar.""" + + def short_circuit_nulls_inner(binop): + @functools.wraps(binop) + def wrapped_binop(x: ibis_types.Value, y: ibis_types.Value): + if isinstance(x, ibis_types.NullScalar): + return ibis_types.null().cast(type_override or y.type()) + elif isinstance(y, ibis_types.NullScalar): + return ibis_types.null().cast(type_override or x.type()) + else: + return binop(x, y) + + return wrapped_binop + + return short_circuit_nulls_inner + + +@scalar_op_compiler.register_binary_op(ops.strconcat_op) +def concat_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + x_string = typing.cast(ibis_types.StringValue, x) + y_string = typing.cast(ibis_types.StringValue, y) + return x_string.concat(y_string) + + +@scalar_op_compiler.register_binary_op(ops.eq_op) +def eq_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return x == y + + +@scalar_op_compiler.register_binary_op(ops.eq_null_match_op) +def eq_nulls_match_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + """Variant of eq_op where nulls match each other. Only use where dtypes are known to be same.""" + left = x.cast(ibis_dtypes.str).fillna(ibis_types.literal("$NULL_SENTINEL$")) + right = y.cast(ibis_dtypes.str).fillna(ibis_types.literal("$NULL_SENTINEL$")) + return left == right + + +@scalar_op_compiler.register_binary_op(ops.ne_op) +def ne_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return x != y + + +def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue): + return ibis.where( + where_value, + value, + ibis.null(), + ) + + +@scalar_op_compiler.register_binary_op(ops.and_op) +def and_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For AND, when we encounter a + # NULL value, we only know when the result is FALSE, otherwise the result + # is unknown (NULL). See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis.literal(False)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis.literal(False)) + return typing.cast(ibis_types.BooleanValue, x) & typing.cast( + ibis_types.BooleanValue, y + ) + + +@scalar_op_compiler.register_binary_op(ops.or_op) +def or_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For OR, when we encounter a + # NULL value, we only know when the result is TRUE, otherwise the result + # is unknown (NULL). 
See: truth table at
+    # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR
+    if isinstance(x, ibis_types.NullScalar):
+        return _null_or_value(y, y == ibis.literal(True))
+
+    if isinstance(y, ibis_types.NullScalar):
+        return _null_or_value(x, x == ibis.literal(True))
+    return typing.cast(ibis_types.BooleanValue, x) | typing.cast(
+        ibis_types.BooleanValue, y
+    )
+
+
+@scalar_op_compiler.register_binary_op(ops.add_op)
+@short_circuit_nulls()
+def add_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    if isinstance(x, ibis_types.NullScalar) or isinstance(y, ibis_types.NullScalar):
+        return ibis.null()
+    try:
+        # Could be string concatenation or numeric addition.
+        return x + y  # type: ignore
+    except ibis.common.annotations.SignatureValidationError as exc:
+        left_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(x.type())
+        right_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(y.type())
+        raise TypeError(
+            f"Cannot add {repr(left_type)} and {repr(right_type)}. {constants.FEEDBACK_LINK}"
+        ) from exc
+
+
+@scalar_op_compiler.register_binary_op(ops.sub_op)
+@short_circuit_nulls()
+def sub_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    return typing.cast(ibis_types.NumericValue, x) - typing.cast(
+        ibis_types.NumericValue, y
+    )
+
+
+@scalar_op_compiler.register_binary_op(ops.mul_op)
+@short_circuit_nulls()
+def mul_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    return typing.cast(ibis_types.NumericValue, x) * typing.cast(
+        ibis_types.NumericValue, y
+    )
+
+
+@scalar_op_compiler.register_binary_op(ops.div_op)
+@short_circuit_nulls(ibis_dtypes.float)
+def div_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    return typing.cast(ibis_types.NumericValue, x) / typing.cast(
+        ibis_types.NumericValue, y
+    )
+
+
+@scalar_op_compiler.register_binary_op(ops.pow_op)
+@short_circuit_nulls(ibis_dtypes.float)
+def pow_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    if x.type().is_integer() and y.type().is_integer():
+        return _int_pow_op(x, y)
+    else:
+        return _float_pow_op(x, y)
+
+
+@scalar_op_compiler.register_binary_op(ops.unsafe_pow_op)
+@short_circuit_nulls(ibis_dtypes.float)
+def unsafe_pow_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    """For internal use only - where domain and overflow checks are not needed."""
+    return typing.cast(ibis_types.NumericValue, x) ** typing.cast(
+        ibis_types.NumericValue, y
+    )
+
+
+def _int_pow_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    # Need to avoid any error cases - should produce NaN instead
+    # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow
+    x_as_decimal = typing.cast(
+        ibis_types.NumericValue,
+        x.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)),
+    )
+    y_val = typing.cast(ibis_types.NumericValue, y)
+
+    # BQ POW() function outputs FLOAT64, which can lose precision.
+    # Therefore, we do math in NUMERIC and cast back down after.
+    # Also, do explicit bounds checks; pandas will silently overflow.
+    pow_result = x_as_decimal**y_val
+    overflow_cond = (pow_result > _ibis_num((2**63) - 1)) | (
+        pow_result < _ibis_num(-(2**63))
+    )
+
+    return (
+        ibis.case()
+        .when((overflow_cond), ibis.null())
+        .else_(pow_result.cast(ibis_dtypes.int64))
+        .end()
+    )
+
+
+def _float_pow_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    # Most conditions here seek to prevent calling BQ POW with inputs that would generate errors.
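+    # Since |x**y| == exp(y * ln|x|), a finite result overflows FLOAT64 exactly when
+    # y * ln|x| exceeds ~709.78 (_FLOAT64_EXP_BOUND); overflow_cond below encodes that check.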
+    # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow
+    x_val = typing.cast(ibis_types.NumericValue, x)
+    y_val = typing.cast(ibis_types.NumericValue, y)
+
+    overflow_cond = (x_val != _ZERO) & ((y_val * x_val.abs().ln()) > _FLOAT64_EXP_BOUND)
+
+    # Float64 loses integer precision beyond 2**53; beyond this there is insufficient precision to get parity
+    exp_too_big = y_val.abs() > _ibis_num(2**53)
+    # Treat very large exponents as +-INF
+    norm_exp = exp_too_big.ifelse(_INF * y_val.sign(), y_val)
+
+    pow_result = x_val**norm_exp
+
+    # This cast is dangerous; it must only be executed where y_val has been bounds-checked
+    # Ibis needs a try_cast binding to BQ safe_cast
+    exponent_is_whole = y_val.cast(ibis_dtypes.int64) == y_val
+    odd_exponent = (x_val < _ZERO) & (
+        y_val.cast(ibis_dtypes.int64) % _ibis_num(2) == _ibis_num(1)
+    )
+    infinite_base = x_val.abs() == _INF
+
+    return (
+        ibis.case()
+        # Might be able to do something more clever with the x_val==0 case
+        .when(y_val == _ZERO, _ibis_num(1))
+        .when(
+            x_val == _ibis_num(1), _ibis_num(1)
+        )  # Need to ignore exponent, even if it is NA
+        .when(
+            (x_val == _ZERO) & (y_val < _ZERO), _INF
+        )  # This case would error the POW function in BQ
+        .when(infinite_base, pow_result)
+        .when(
+            exp_too_big, pow_result
+        )  # BigQuery can actually handle the +-inf cases gracefully
+        .when((x_val < _ZERO) & (~exponent_is_whole), _NAN)
+        .when(
+            overflow_cond, _INF * odd_exponent.ifelse(_ibis_num(-1), _ibis_num(1))
+        )  # finite overflows would cause BQ to error
+        .else_(pow_result)
+        .end()
+    )
+
+
+@scalar_op_compiler.register_binary_op(ops.lt_op)
+@short_circuit_nulls(ibis_dtypes.bool)
+def lt_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    return x < y
+
+
+@scalar_op_compiler.register_binary_op(ops.le_op)
+@short_circuit_nulls(ibis_dtypes.bool)
+def le_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    return x <= y
+
+
+@scalar_op_compiler.register_binary_op(ops.gt_op)
+@short_circuit_nulls(ibis_dtypes.bool)
+def gt_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    return x > y
+
+
+@scalar_op_compiler.register_binary_op(ops.ge_op)
+@short_circuit_nulls(ibis_dtypes.bool)
+def ge_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    return x >= y
+
+
+@scalar_op_compiler.register_binary_op(ops.floordiv_op)
+@short_circuit_nulls(ibis_dtypes.int)
+def floordiv_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    x_numeric = typing.cast(ibis_types.NumericValue, x)
+    y_numeric = typing.cast(ibis_types.NumericValue, y)
+    floordiv_expr = x_numeric // y_numeric
+
+    # DIV(N, 0) will error in BigQuery, but needs to return 0 for int and inf for float, so we short-circuit in this case.
+    # Multiplying left by zero propagates nulls.
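+    # For example, pandas yields Series([7], dtype="Int64") // 0 == 0 and
+    # Series([7.0]) // 0 == inf; the case expression below reproduces that.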
+    zero_result = _INF if (x.type().is_floating() or y.type().is_floating()) else _ZERO
+    return (
+        ibis.case()
+        .when(y_numeric == _ZERO, zero_result * x_numeric)
+        .else_(floordiv_expr)
+        .end()
+    )
+
+
+def _is_float(x: ibis_types.Value):
+    return isinstance(x, (ibis_types.FloatingColumn, ibis_types.FloatingScalar))
+
+
+@scalar_op_compiler.register_binary_op(ops.mod_op)
+@short_circuit_nulls()
+def mod_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    is_result_float = _is_float(x) | _is_float(y)
+    x_numeric = typing.cast(
+        ibis_types.NumericValue,
+        x.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True))
+        if is_result_float
+        else x,
+    )
+    y_numeric = typing.cast(
+        ibis_types.NumericValue,
+        y.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True))
+        if is_result_float
+        else y,
+    )
+    # Hacky short-circuit to avoid passing a zero literal to the SQL backend; instead evaluate locally to null.
+    op = y.op()
+    if isinstance(op, ibis.expr.operations.generic.Literal) and op.value == 0:
+        return ibis_types.null().cast(x.type())
+
+    bq_mod = x_numeric % y_numeric  # BigQuery will maintain x's sign here
+    if is_result_float:
+        bq_mod = typing.cast(ibis_types.NumericValue, bq_mod.cast(ibis_dtypes.float64))
+
+    # In BigQuery the returned value has the same sign as x. In pandas, the sign of y is used, so we need to flip the result if sign(x) != sign(y)
+    return (
+        ibis.case()
+        .when(
+            y_numeric == _ZERO,
+            _NAN * x_numeric if is_result_float else _ZERO * x_numeric,
+        )  # Dummy op to propagate nulls and type from x arg
+        .when(
+            (y_numeric < _ZERO) & (bq_mod > _ZERO), (y_numeric + bq_mod)
+        )  # Convert positive result to negative
+        .when(
+            (y_numeric > _ZERO) & (bq_mod < _ZERO), (y_numeric + bq_mod)
+        )  # Convert negative result to positive
+        .else_(bq_mod)
+        .end()
+    )
+
+
+@scalar_op_compiler.register_binary_op(ops.fillna_op)
+def fillna_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    return x.fillna(typing.cast(ibis_types.Scalar, y))
+
+
+@scalar_op_compiler.register_binary_op(ops.round_op)
+def round_op(x: ibis_types.Value, y: ibis_types.Value):
+    return typing.cast(ibis_types.NumericValue, x).round(
+        digits=typing.cast(ibis_types.IntegerValue, y)
+    )
+
+
+@scalar_op_compiler.register_binary_op(ops.coalesce_op)
+def coalesce_impl(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    if x.name("name").equals(y.name("name")):
+        return x
+    else:
+        return ibis.coalesce(x, y)
+
+
+@scalar_op_compiler.register_binary_op(ops.cliplower_op)
+def clip_lower(
+    value: ibis_types.Value,
+    lower: ibis_types.Value,
+):
+    return ibis.case().when(lower.isnull() | (value < lower), lower).else_(value).end()
+
+
+@scalar_op_compiler.register_binary_op(ops.clipupper_op)
+def clip_upper(
+    value: ibis_types.Value,
+    upper: ibis_types.Value,
+):
+    return ibis.case().when(upper.isnull() | (value > upper), upper).else_(value).end()
+
+
+# Ternary Operations
+@scalar_op_compiler.register_ternary_op(ops.where_op)
+def where_op(
+    original: ibis_types.Value,
+    condition: ibis_types.Value,
+    replacement: ibis_types.Value,
+) -> ibis_types.Value:
+    """Returns original if condition is true, otherwise returns replacement."""
+    return ibis.case().when(condition, original).else_(replacement).end()
+
+
+@scalar_op_compiler.register_ternary_op(ops.clip_op)
+def clip_op(
+    original: ibis_types.Value,
+    lower: ibis_types.Value,
+    upper: ibis_types.Value,
+) -> ibis_types.Value:
+    """Clips value to lower and upper bounds."""
+    if isinstance(lower, ibis_types.NullScalar) and (
+        not isinstance(upper, ibis_types.NullScalar)
+    ):
+        
return ( + ibis.case() + .when(upper.isnull() | (original > upper), upper) + .else_(original) + .end() + ) + elif (not isinstance(lower, ibis_types.NullScalar)) and isinstance( + upper, ibis_types.NullScalar + ): + return ( + ibis.case() + .when(lower.isnull() | (original < lower), lower) + .else_(original) + .end() + ) + elif isinstance(lower, ibis_types.NullScalar) and ( + isinstance(upper, ibis_types.NullScalar) + ): + return original + else: + # Note: Pandas has unchanged behavior when upper bound and lower bound are flipped. This implementation requires that lower_bound < upper_bound + return ( + ibis.case() + .when(lower.isnull() | (original < lower), lower) + .when(upper.isnull() | (original > upper), upper) + .else_(original) + .end() + ) + + +# Composition Ops +@scalar_op_compiler.register_unary_op(ops.ApplyRight, pass_op=True) +def apply_right(input: ibis_types.Value, op: ops.ApplyRight): + right = dtypes.literal_to_ibis_scalar(op.right_scalar, validate=False) + return scalar_op_compiler.compile_row_op(op.base_op, (input, right)) + + +@scalar_op_compiler.register_unary_op(ops.ApplyLeft, pass_op=True) +def apply_left(input: ibis_types.Value, op: ops.ApplyLeft): + left = dtypes.literal_to_ibis_scalar(op.left_scalar, validate=False) + return scalar_op_compiler.compile_row_op(op.base_op, (left, input)) + + +@scalar_op_compiler.register_binary_op(ops.ReverseArgsOp, pass_op=True) +def apply_reversed( + input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ReverseArgsOp +): + return scalar_op_compiler.compile_row_op(op.base_op, (input2, input1)) + + +@scalar_op_compiler.register_binary_op(ops.ApplyArg1, pass_op=True) +def apply_arg1(input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ApplyArg1): + arg1 = dtypes.literal_to_ibis_scalar(op.scalar, validate=False) + return scalar_op_compiler.compile_row_op(op.base_op, (arg1, input1, input2)) + + +@scalar_op_compiler.register_binary_op(ops.ApplyArg3, pass_op=True) +def apply_arg3(input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ApplyArg3): + arg3 = dtypes.literal_to_ibis_scalar(op.scalar, validate=False) + return scalar_op_compiler.compile_row_op(op.base_op, (input1, input2, arg3)) + + +# Helpers +def is_null(value) -> bool: + # float NaN/inf should be treated as distinct from 'true' null values + return typing.cast(bool, pd.isna(value)) and not isinstance(value, float) + + +def _ibis_num(number: float): + return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 6fc284403d..f148759f61 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -186,7 +186,7 @@ def astype( ) -> Index: if self.nlevels > 1: raise TypeError("Multiindex does not support 'astype'") - return self._apply_unary_op(ops.AsTypeOp(dtype)) + return self._apply_unary_op(ops.AsTypeOp(to_type=dtype)) def all(self) -> bool: if self.nlevels > 1: @@ -278,7 +278,7 @@ def drop( level_id = self._block.index_columns[0] if utils.is_list_like(labels): block, inverse_condition_id = block.apply_unary_op( - level_id, ops.IsInOp(labels, match_nulls=True) + level_id, ops.IsInOp(values=tuple(labels), match_nulls=True) ) block, condition_id = block.apply_unary_op( inverse_condition_id, ops.invert_op @@ -308,9 +308,9 @@ def isin(self, values) -> Index: f"isin(), you passed a [{type(values).__name__}]" ) - return self._apply_unary_op(ops.IsInOp(values, match_nulls=True)).fillna( - value=False - ) + return self._apply_unary_op( + 
ops.IsInOp(values=tuple(values), match_nulls=True) + ).fillna(value=False) def _apply_unary_op( self, diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 556851fa1b..860d394cd2 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -19,12 +19,13 @@ _lock = threading.Lock() MAX_LABELS_COUNT = 64 _api_methods: List = [] +_excluded_methods = ["__setattr__", "__getattr__"] def class_logger(decorated_cls): """Decorator that adds logging functionality to each method of the class.""" for attr_name, attr_value in decorated_cls.__dict__.items(): - if callable(attr_value): + if callable(attr_value) and (attr_name not in _excluded_methods): setattr(decorated_cls, attr_name, method_logger(attr_value, decorated_cls)) return decorated_cls diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 30444f5565..5385852432 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -122,7 +122,6 @@ def __hash__(self): @dataclass(frozen=True) class ReadLocalNode(BigFrameNode): feather_bytes: bytes - column_ids: typing.Tuple[str, ...] def __hash__(self): return self._node_hash @@ -197,38 +196,15 @@ def __hash__(self): @dataclass(frozen=True) -class ProjectUnaryOpNode(UnaryNode): - input_id: str - op: ops.UnaryOp +class ProjectRowOpNode(UnaryNode): + input_ids: typing.Tuple[str, ...] + op: ops.RowOp output_id: Optional[str] = None def __hash__(self): return self._node_hash -@dataclass(frozen=True) -class ProjectBinaryOpNode(UnaryNode): - left_input_id: str - right_input_id: str - op: ops.BinaryOp - output_id: str - - def __hash__(self): - return self._node_hash - - -@dataclass(frozen=True) -class ProjectTernaryOpNode(UnaryNode): - input_id1: str - input_id2: str - input_id3: str - op: ops.TernaryOp - output_id: str - - def __hash__(self): - return self._node_hash - - # TODO: Merge RowCount and Corr into Aggregate Node @dataclass(frozen=True) class RowCountNode(UnaryNode): diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index dc7c709011..4331999dd6 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re import typing from typing import Hashable, Iterable, List @@ -84,26 +85,42 @@ def get_standardized_ids( Tuple of (standardized_column_ids, standardized_index_ids) """ col_ids = [ - UNNAMED_COLUMN_ID if col_label is None else str(col_label) + UNNAMED_COLUMN_ID if col_label is None else label_to_identifier(col_label) for col_label in col_labels ] idx_ids = [ - UNNAMED_INDEX_ID if idx_label is None else str(idx_label) + UNNAMED_INDEX_ID if idx_label is None else label_to_identifier(idx_label) for idx_label in idx_labels ] - ids = idx_ids + col_ids + ids = disambiguate_ids(idx_ids + col_ids) + + idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :] + + return col_ids, idx_ids + + +def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str: + """ + Convert pandas label to make legal bigquery identifier. May create collisions (should deduplicate after). + Strict mode might not be necessary, but ibis seems to escape non-alphanumeric characters inconsistently. + """ # Column values will be loaded as null if the column name has spaces. 
    # https://github.com/googleapis/python-bigquery/issues/1566
-    ids = [id.replace(" ", "_") for id in ids]
+    identifier = str(label).replace(" ", "_")
+    if strict:
+        identifier = re.sub(r"[^a-zA-Z0-9_]", "", identifier)
+        if not identifier:
+            identifier = "id"
+    return identifier
+

-    ids = typing.cast(
+def disambiguate_ids(ids: typing.Sequence[str]) -> typing.List[str]:
+    """Disambiguate list of ids by adding suffixes where needed. If inputs are legal sql ids, outputs should be as well."""
+    return typing.cast(
         List[str],
         vendored_pandas_io_common.dedup_names(ids, is_potential_multiindex=False),
     )

-    idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :]
-
-    return col_ids, idx_ids


 def merge_column_labels(
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 595670b0b6..1f039904f0 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -154,7 +154,7 @@ def __init__(
                 block = block.select_columns(list(columns))  # type:ignore
         if dtype:
             block = block.multi_apply_unary_op(
-                block.value_columns, ops.AsTypeOp(dtype)
+                block.value_columns, ops.AsTypeOp(to_type=dtype)
             )
         self._block = block
@@ -251,6 +251,15 @@ def index(
     ) -> indexes.Index:
         return indexes.Index(self)

+    @index.setter
+    def index(self, value):
+        # TODO: Handle assigning MultiIndex
+        result = self._assign_single_item("_new_bf_index", value).set_index(
+            "_new_bf_index"
+        )
+        self._set_block(result._get_block())
+        self.index.name = value.name if hasattr(value, "name") else None
+
     @property
     def loc(self) -> indexers.LocDataFrameIndexer:
         return indexers.LocDataFrameIndexer(self)
@@ -316,7 +325,7 @@ def astype(
         self,
         dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype],
     ) -> DataFrame:
-        return self._apply_unary_op(ops.AsTypeOp(dtype))
+        return self._apply_unary_op(ops.AsTypeOp(to_type=dtype))

     def _to_sql_query(
         self, include_index: bool
@@ -545,6 +554,29 @@ def __getattr__(self, key: str):
         else:
             raise AttributeError(key)

+    def __setattr__(self, key: str, value):
+        if key in ["_block", "_query_job"]:
+            object.__setattr__(self, key, value)
+            return
+        # Can this be removed???
+        try:
+            # boring attributes go through boring old path
+            object.__getattribute__(self, key)
+            return object.__setattr__(self, key, value)
+        except AttributeError:
+            pass
+
+        # if this fails, go on to more involved attribute setting
+        # (note that this matches __getattr__, above).
+        try:
+            if key in self.columns:
+                self[key] = value
+            else:
+                object.__setattr__(self, key, value)
+        # Can this be removed?
+        except (AttributeError, TypeError):
+            object.__setattr__(self, key, value)
+
     def __repr__(self) -> str:
         """Converts a DataFrame to a string. Calls to_pandas.
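The `__setattr__` hunk above routes assignment to an existing column label through `__setitem__`, and the new `index` setter swaps in a new index column. A minimal usage sketch of the behavior these hunks enable, assuming an initialized BigQuery DataFrames session (the table and column names are illustrative, not part of this change):

    import bigframes.pandas as bpd

    df = bpd.DataFrame({"name": ["a", "b"], "score": [1, 2]})

    # Property-style assignment: "score" is an existing column label, so this
    # is equivalent to df["score"] = df["score"] + 1 rather than setting an
    # ordinary Python attribute on the instance.
    df.score = df["score"] + 1

    # Assigning to .index goes through the new setter, which appends the value
    # as a hidden column and calls set_index on it.
    df.index = df["name"]

Note that `object.__setattr__` still triggers the `index` property's setter because data descriptors take precedence, which is why plain attribute fallback and property assignment coexist here.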
@@ -638,7 +670,7 @@ def _apply_binop(
     def _apply_scalar_binop(self, other: float | int, op: ops.BinaryOp) -> DataFrame:
         block = self._block
-        partial_op = ops.BinopPartialRight(op, other)
+        partial_op = ops.ApplyRight(base_op=op, right_scalar=other)
         for column_id, label in zip(
             self._block.value_columns, self._block.column_labels
         ):
@@ -1062,12 +1094,31 @@ def drop(
             level_id = self._resolve_levels(level or 0)[0]

         if utils.is_list_like(index):
-            block, inverse_condition_id = block.apply_unary_op(
-                level_id, ops.IsInOp(index, match_nulls=True)
-            )
-            block, condition_id = block.apply_unary_op(
-                inverse_condition_id, ops.invert_op
-            )
+            # Only tuple is treated as multi-index value combinations
+            if isinstance(index, tuple):
+                if level is not None:
+                    raise ValueError("Multi-index tuple can't specify level.")
+                condition_id = None
+                for i, idx in enumerate(index):
+                    level_id = self._resolve_levels(i)[0]
+                    block, condition_id_cur = block.apply_unary_op(
+                        level_id, ops.partial_right(ops.ne_op, idx)
+                    )
+                    if condition_id:
+                        block, condition_id = block.apply_binary_op(
+                            condition_id, condition_id_cur, ops.or_op
+                        )
+                    else:
+                        condition_id = condition_id_cur
+
+                condition_id = typing.cast(str, condition_id)
+            else:
+                block, inverse_condition_id = block.apply_unary_op(
+                    level_id, ops.IsInOp(values=tuple(index), match_nulls=True)
+                )
+                block, condition_id = block.apply_unary_op(
+                    inverse_condition_id, ops.invert_op
+                )
         elif isinstance(index, indexes.Index):
             return self._drop_by_index(index)
         else:
@@ -1246,6 +1297,15 @@ def _assign_single_item_listlike(self, k: str, v: Sequence) -> DataFrame:
             [get_column_left[col_id] for col_id in original_index_column_ids],
             index_labels=self._block.index_labels,
         )
+        src_col = get_column_right[new_column_block.value_columns[0]]
+        # Check to see if key exists, and modify in place
+        col_ids = self._block.cols_matching_label(k)
+        for col_id in col_ids:
+            result_block = result_block.copy_values(
+                src_col, get_column_left[col_id]
+            )
+        if len(col_ids) > 0:
+            result_block = result_block.drop_columns([src_col])
         return DataFrame(result_block)

     def _assign_scalar(self, label: str, value: Union[int, float]) -> DataFrame:
@@ -1423,16 +1483,16 @@ def _filter_rows(
         block = self._block
         block, label_string_id = block.apply_unary_op(
             self._block.index_columns[0],
-            ops.AsTypeOp(pandas.StringDtype(storage="pyarrow")),
+            ops.AsTypeOp(to_type=pandas.StringDtype(storage="pyarrow")),
         )
         if like is not None:
             block, mask_id = block.apply_unary_op(
-                label_string_id, ops.ContainsStringOp(pat=like)
+                label_string_id, ops.StrContainsOp(pat=like)
             )
         else:  # regex
             assert regex is not None
             block, mask_id = block.apply_unary_op(
-                label_string_id, ops.ContainsRegexOp(pat=regex)
+                label_string_id, ops.StrContainsRegexOp(pat=regex)
             )

         block = block.filter(mask_id)
@@ -1442,7 +1502,7 @@
         # Behavior matches pandas 2.1+, older pandas versions would reindex
         block = self._block
         block, mask_id = block.apply_unary_op(
-            self._block.index_columns[0], ops.IsInOp(values=list(items))
+            self._block.index_columns[0], ops.IsInOp(values=tuple(items))
         )
         block = block.filter(mask_id)
         block = block.select_columns(self._block.value_columns)
@@ -1593,7 +1653,9 @@ def isin(self, values) -> DataFrame:
                 if label in values.keys():
                     value_for_key = values[label]
                     block, result_id = block.apply_unary_op(
-                        col, ops.IsInOp(value_for_key, match_nulls=True), label
+                        col,
+                        ops.IsInOp(values=tuple(value_for_key), match_nulls=True),
+                        label,
                     )
                     result_ids.append(result_id)
                 else:
@@ -1603,9 +1665,9 @@ def isin(self, values) -> DataFrame:
                    result_ids.append(result_id)
             return DataFrame(block.select_columns(result_ids)).fillna(value=False)
         elif utils.is_list_like(values):
-            return self._apply_unary_op(ops.IsInOp(values, match_nulls=True)).fillna(
-                value=False
-            )
+            return self._apply_unary_op(
+                ops.IsInOp(values=tuple(values), match_nulls=True)
+            ).fillna(value=False)
         else:
             raise TypeError(
                 "only list-like objects are allowed to be passed to "
@@ -2499,6 +2561,7 @@ def to_gbq(
         if_exists: Optional[Literal["fail", "replace", "append"]] = None,
         index: bool = True,
         ordering_id: Optional[str] = None,
+        clustering_columns: Union[pandas.Index, Iterable[typing.Hashable]] = (),
     ) -> str:
         dispositions = {
             "fail": bigquery.WriteDisposition.WRITE_EMPTY,
@@ -2506,18 +2569,9 @@
             "replace": bigquery.WriteDisposition.WRITE_TRUNCATE,
             "append": bigquery.WriteDisposition.WRITE_APPEND,
         }

-        if destination_table is None:
-            # TODO(swast): If there have been no modifications to the DataFrame
-            # since the last time it was written (cached), then return that.
-            # For `read_gbq` nodes, return the underlying table clone.
-            destination_table = bigframes.session._io.bigquery.create_temp_table(
-                self._session.bqclient,
-                self._session._anonymous_dataset,
-                # TODO(swast): allow custom expiration times, probably via session configuration.
-                datetime.datetime.now(datetime.timezone.utc)
-                + constants.DEFAULT_EXPIRATION,
-            )
+        temp_table_ref = None

+        if destination_table is None:
             if if_exists is not None and if_exists != "replace":
                 raise ValueError(
                     f"Got invalid value {repr(if_exists)} for if_exists. "
@@ -2526,6 +2580,11 @@
                 )
             if_exists = "replace"

+            temp_table_ref = bigframes.session._io.bigquery.random_table(
+                self._session._anonymous_dataset
+            )
+            destination_table = f"{temp_table_ref.project}.{temp_table_ref.dataset_id}.{temp_table_ref.table_id}"
+
         table_parts = destination_table.split(".")
         default_project = self._block.expr.session.bqclient.project
@@ -2553,15 +2612,29 @@
         except google.api_core.exceptions.NotFound:
             self._session.bqclient.create_dataset(destination_dataset, exists_ok=True)

+        clustering_fields = self._map_clustering_columns(
+            clustering_columns, index=index
+        )
+
         job_config = bigquery.QueryJobConfig(
             write_disposition=dispositions[if_exists],
             destination=bigquery.table.TableReference.from_string(
                 destination_table,
                 default_project=default_project,
             ),
+            clustering_fields=clustering_fields if clustering_fields else None,
         )

         self._run_io_query(index=index, ordering_id=ordering_id, job_config=job_config)
+
+        if temp_table_ref:
+            bigframes.session._io.bigquery.set_table_expiration(
+                self._session.bqclient,
+                temp_table_ref,
+                datetime.datetime.now(datetime.timezone.utc)
+                + constants.DEFAULT_EXPIRATION,
+            )
+
         return destination_table

     def to_numpy(
@@ -2756,6 +2829,52 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame:
         block = self._block.multi_apply_unary_op(self._block.value_columns, operation)
         return DataFrame(block)

+    def _map_clustering_columns(
+        self,
+        clustering_columns: Union[pandas.Index, Iterable[typing.Hashable]],
+        index: bool,
+    ) -> List[str]:
+        """Maps the provided clustering columns to the existing columns in the DataFrame."""
+
+        def map_columns_on_occurrence(columns):
+            mapped_columns = []
+            for col in clustering_columns:
+                if col in columns:
+                    count = columns.count(col)
+                    mapped_columns.extend([col] * count)
+            return mapped_columns
+
+        if not clustering_columns:
+            return []
+
+        if len(list(clustering_columns)) != len(set(clustering_columns)):
+            raise ValueError("Duplicates are not supported in clustering_columns")
clustering_columns") + + all_possible_columns = ( + (set(self.columns) | set(self.index.names)) if index else set(self.columns) + ) + missing_columns = set(clustering_columns) - all_possible_columns + if missing_columns: + raise ValueError( + f"Clustering columns not found in DataFrame: {missing_columns}" + ) + + clustering_columns_for_df = map_columns_on_occurrence( + list(self._block.column_labels) + ) + clustering_columns_for_index = ( + map_columns_on_occurrence(list(self.index.names)) if index else [] + ) + + ( + clustering_columns_for_df, + clustering_columns_for_index, + ) = utils.get_standardized_ids( + clustering_columns_for_df, clustering_columns_for_index + ) + + return clustering_columns_for_index + clustering_columns_for_df + def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: """Create query text representing this dataframe for I/O.""" array_value = self._block.expr @@ -2824,7 +2943,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: # inputs causing errors. reprojected_df = DataFrame(self._block._force_reproject()) return reprojected_df._apply_unary_op( - ops.RemoteFunctionOp(func, apply_on_null=(na_action is None)) + ops.RemoteFunctionOp(func=func, apply_on_null=(na_action is None)) ) def apply(self, func, *, args: typing.Tuple = (), **kwargs): diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index b754acea2e..608885dec4 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -40,6 +40,7 @@ pd.Int64Dtype, pd.StringDtype, pd.ArrowDtype, + gpd.array.GeometryDtype, ] # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable @@ -139,7 +140,7 @@ ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()} -IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, Union[Dtype, np.dtype[Any]]] = { +IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, Dtype] = { ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS } # Allow REQUIRED fields to map correctly. @@ -179,7 +180,7 @@ def ibis_dtype_to_bigframes_dtype( ibis_dtype: ibis_dtypes.DataType, -) -> Union[Dtype, np.dtype[Any]]: +) -> Dtype: """Converts an Ibis dtype to a BigQuery DataFrames dtype Args: @@ -340,6 +341,11 @@ def literal_to_ibis_scalar( ValueError: if passed literal cannot be coerced to a BigQuery DataFrames compatible scalar """ + # Special case: Can create nulls for non-bidirectional types + if (force_dtype == gpd.array.GeometryDtype()) and pd.isna(literal): + # Ibis has bug for casting nulltype to geospatial, so we perform intermediate cast first + geotype = ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True) + return ibis.literal(None, geotype) ibis_dtype = BIGFRAMES_TO_IBIS[force_dtype] if force_dtype else None if pd.api.types.is_list_like(literal): @@ -538,6 +544,8 @@ def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]: def lcd_type(dtype1: Dtype, dtype2: Dtype) -> typing.Optional[Dtype]: + if dtype1 == dtype2: + return dtype1 # Implicit conversion currently only supported for numeric types hierarchy: list[Dtype] = [ pd.BooleanDtype(), @@ -550,3 +558,12 @@ def lcd_type(dtype1: Dtype, dtype2: Dtype) -> typing.Optional[Dtype]: return None lcd_index = max(hierarchy.index(dtype1), hierarchy.index(dtype2)) return hierarchy[lcd_index] + + +def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: + result = lcd_type(dtype1, dtype2) + if result is None: + raise NotImplementedError( + f"BigFrames cannot upcast {dtype1} and {dtype2} to common type. 
{constants.FEEDBACK_LINK}" + ) + return result diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 5beb54a32d..8c01159113 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -19,6 +19,8 @@ from typing import cast, Literal, Optional, Union import warnings +from google.cloud import bigquery + import bigframes from bigframes import clients, constants from bigframes.core import blocks, log_adapter @@ -113,6 +115,26 @@ def _create_bqml_model(self): session=self.session, connection_name=self.connection_name, options=options ) + @classmethod + def _from_bq( + cls, session: bigframes.Session, model: bigquery.Model + ) -> PaLM2TextGenerator: + assert model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in model._properties + assert "endpoint" in model._properties["remoteModelInfo"] + assert "connection" in model._properties["remoteModelInfo"] + + # Parse the remote model endpoint + bqml_endpoint = model._properties["remoteModelInfo"]["endpoint"] + model_connection = model._properties["remoteModelInfo"]["connection"] + model_endpoint = bqml_endpoint.split("/")[-1] + + text_generator_model = cls( + session=session, model_name=model_endpoint, connection_name=model_connection + ) + text_generator_model._bqml_model = core.BqmlModel(session, model) + return text_generator_model + def predict( self, X: Union[bpd.DataFrame, bpd.Series], @@ -200,6 +222,21 @@ def predict( return df + def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator: + """Save the model to BigQuery. + + Args: + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. + + Returns: + PaLM2TextGenerator: saved model.""" + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) + @log_adapter.class_logger class PaLM2TextEmbeddingGenerator(base.Predictor): @@ -271,6 +308,26 @@ def _create_bqml_model(self): session=self.session, connection_name=self.connection_name, options=options ) + @classmethod + def _from_bq( + cls, session: bigframes.Session, model: bigquery.Model + ) -> PaLM2TextEmbeddingGenerator: + assert model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in model._properties + assert "endpoint" in model._properties["remoteModelInfo"] + assert "connection" in model._properties["remoteModelInfo"] + + # Parse the remote model endpoint + bqml_endpoint = model._properties["remoteModelInfo"]["endpoint"] + model_connection = model._properties["remoteModelInfo"]["connection"] + model_endpoint = bqml_endpoint.split("/")[-1] + + embedding_generator_model = cls( + session=session, model_name=model_endpoint, connection_name=model_connection + ) + embedding_generator_model._bqml_model = core.BqmlModel(session, model) + return embedding_generator_model + def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """Predict the result from input DataFrame. @@ -307,3 +364,20 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) return df + + def to_gbq( + self, model_name: str, replace: bool = False + ) -> PaLM2TextEmbeddingGenerator: + """Save the model to BigQuery. + + Args: + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. 
+
+        Returns:
+            PaLM2TextEmbeddingGenerator: saved model."""
+
+        new_model = self._bqml_model.copy(model_name, replace)
+        return new_model.session.read_gbq_model(model_name)
diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py
index 805747c49b..4ffde43543 100644
--- a/bigframes/ml/loader.py
+++ b/bigframes/ml/loader.py
@@ -28,6 +28,7 @@
     forecasting,
     imported,
     linear_model,
+    llm,
     pipeline,
 )

@@ -47,6 +48,15 @@
     }
 )

+_BQML_ENDPOINT_TYPE_MAPPING = MappingProxyType(
+    {
+        llm._TEXT_GENERATOR_BISON_ENDPOINT: llm.PaLM2TextGenerator,
+        llm._TEXT_GENERATOR_BISON_32K_ENDPOINT: llm.PaLM2TextGenerator,
+        llm._EMBEDDING_GENERATOR_GECKO_ENDPOINT: llm.PaLM2TextEmbeddingGenerator,
+        llm._EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT: llm.PaLM2TextEmbeddingGenerator,
+    }
+)
+

 def from_bq(
     session: bigframes.Session, bq_model: bigquery.Model
@@ -62,6 +72,8 @@
     ensemble.RandomForestClassifier,
     imported.TensorFlowModel,
     imported.ONNXModel,
+    llm.PaLM2TextGenerator,
+    llm.PaLM2TextEmbeddingGenerator,
     pipeline.Pipeline,
 ]:
     """Load a BQML model to BigQuery DataFrames ML.
@@ -84,6 +96,17 @@ def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model):
         return _BQML_MODEL_TYPE_MAPPING[bq_model.model_type]._from_bq(  # type: ignore
             session=session, model=bq_model
         )
+    if (
+        bq_model.model_type == "MODEL_TYPE_UNSPECIFIED"
+        and "remoteModelInfo" in bq_model._properties
+        and "endpoint" in bq_model._properties["remoteModelInfo"]
+    ):
+        # Parse the remote model endpoint
+        bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"]
+        endpoint_model = bqml_endpoint.split("/")[-1]
+        return _BQML_ENDPOINT_TYPE_MAPPING[endpoint_model]._from_bq(  # type: ignore
+            session=session, model=bq_model
+        )

     raise NotImplementedError(
         f"Model type {bq_model.model_type} is not yet supported by BigQuery DataFrames. {constants.FEEDBACK_LINK}"
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
index 678774978a..3ef551e453 100644
--- a/bigframes/operations/__init__.py
+++ b/bigframes/operations/__init__.py
@@ -14,1083 +14,379 @@
 from __future__ import annotations

-import functools
+import dataclasses
 import typing

-import ibis
-import ibis.common.annotations
-import ibis.common.exceptions
-import ibis.expr.datatypes as ibis_dtypes
-import ibis.expr.operations.generic
-import ibis.expr.types as ibis_types
 import numpy as np
-import pandas as pd

-import bigframes.constants as constants
-import bigframes.dtypes
 import bigframes.dtypes as dtypes

-_ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0))
-_NAN = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.nan))
-_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.inf))
-_NEG_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(-np.inf))
-
-# Approx Highest number you can pass in to EXP function and get a valid FLOAT64 result
-# FLOAT64 has 11 exponent bits, so max values is about 2**(2**10)
-# ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow.
-_FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78))
-_INT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(43.6))
+class RowOp(typing.Protocol):
+    @property
+    def name(self) -> str:
+        raise NotImplementedError("RowOp abstract base class has no implementation")

-BinaryOp = typing.Callable[[ibis_types.Value, ibis_types.Value], ibis_types.Value]
-TernaryOp = typing.Callable[
-    [ibis_types.Value, ibis_types.Value, ibis_types.Value], ibis_types.Value
-]
+    @property
+    def arguments(self) -> int:
+        """The number of column arguments the operation takes"""
+        raise NotImplementedError("RowOp abstract base class has no implementation")


-### Unary Ops
+# These classes can be used to create simple ops that don't take local parameters
+# All that is needed is a unique name, and to register an implementation in ibis_mappings.py
+@dataclasses.dataclass(frozen=True)
 class UnaryOp:
-    def _as_ibis(self, x):
-        raise NotImplementedError(
-            f"Base class UnaryOp has no implementation. {constants.FEEDBACK_LINK}"
-        )
-
     @property
-    def is_windowed(self):
-        return False
-
-
-# Trig Functions
-class AbsOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.NumericValue, x).abs()
-
-
-class SinOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.NumericValue, x).sin()
-
-
-class CosOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.NumericValue, x).cos()
-
-
-class TanOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.NumericValue, x).tan()
-
-
-# Inverse trig functions
-class ArcsinOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        numeric_value = typing.cast(ibis_types.NumericValue, x)
-        domain = numeric_value.abs() <= _ibis_num(1)
-        return (~domain).ifelse(_NAN, numeric_value.asin())
-
-
-class ArccosOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        numeric_value = typing.cast(ibis_types.NumericValue, x)
-        domain = numeric_value.abs() <= _ibis_num(1)
-        return (~domain).ifelse(_NAN, numeric_value.acos())
-
-
-class ArctanOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.NumericValue, x).atan()
-
-
-# Hyperbolic trig functions
-# BQ has these functions, but Ibis doesn't
-class SinhOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        numeric_value = typing.cast(ibis_types.NumericValue, x)
-        sinh_result = (
-            numeric_value.exp() - (numeric_value.negate()).exp()
-        ) / _ibis_num(2)
-        domain = numeric_value.abs() < _FLOAT64_EXP_BOUND
-        return (~domain).ifelse(_INF * numeric_value.sign(), sinh_result)
-
-
-class CoshOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        numeric_value = typing.cast(ibis_types.NumericValue, x)
-        cosh_result = (
-            numeric_value.exp() + (numeric_value.negate()).exp()
-        ) / _ibis_num(2)
-        domain = numeric_value.abs() < _FLOAT64_EXP_BOUND
-        return (~domain).ifelse(_INF, cosh_result)
-
-
-class TanhOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        numeric_value = typing.cast(ibis_types.NumericValue, x)
-        tanh_result = (numeric_value.exp() - (numeric_value.negate()).exp()) / (
-            numeric_value.exp() + (numeric_value.negate()).exp()
-        )
-        # Beyond +-20, is effectively just the sign function
-        domain = numeric_value.abs() < _ibis_num(20)
-        return (~domain).ifelse(numeric_value.sign(), tanh_result)
-
-
-class ArcsinhOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        numeric_value = typing.cast(ibis_types.NumericValue, x)
-        sqrt_part = ((numeric_value * numeric_value) + _ibis_num(1)).sqrt()
-        return (numeric_value.abs() + sqrt_part).ln() * numeric_value.sign()
-
-
-class ArccoshOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        numeric_value = typing.cast(ibis_types.NumericValue, x)
-        sqrt_part = ((numeric_value * numeric_value) - _ibis_num(1)).sqrt()
-        acosh_result = (numeric_value + sqrt_part).ln()
-        domain = numeric_value >= _ibis_num(1)
-        return (~domain).ifelse(_NAN, acosh_result)
-
-
-class ArctanhOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        numeric_value = typing.cast(ibis_types.NumericValue, x)
-        domain = numeric_value.abs() < _ibis_num(1)
-        numerator = numeric_value + _ibis_num(1)
-        denominator = _ibis_num(1) - numeric_value
-        ln_input = typing.cast(ibis_types.NumericValue, numerator.div(denominator))
-        atanh_result = ln_input.ln().div(2)
-
-        out_of_domain = (numeric_value.abs() == _ibis_num(1)).ifelse(
-            _INF * numeric_value, _NAN
-        )
-
-        return (~domain).ifelse(out_of_domain, atanh_result)
-
-
-class SqrtOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        numeric_value = typing.cast(ibis_types.NumericValue, x)
-        domain = numeric_value >= _ZERO
-        return (~domain).ifelse(_NAN, numeric_value.sqrt())
-
-
-class Log10Op(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        numeric_value = typing.cast(ibis_types.NumericValue, x)
-        domain = numeric_value > _ZERO
-        out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN)
-        return (~domain).ifelse(out_of_domain, numeric_value.log10())
-
-
-class LnOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        numeric_value = typing.cast(ibis_types.NumericValue, x)
-        domain = numeric_value > _ZERO
-        out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN)
-        return (~domain).ifelse(out_of_domain, numeric_value.ln())
-
-
-class ExpOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        numeric_value = typing.cast(ibis_types.NumericValue, x)
-        domain = numeric_value < _FLOAT64_EXP_BOUND
-        return (~domain).ifelse(_INF, numeric_value.exp())
-
-
-class InvertOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.NumericValue, x).negate()
-
-
-class IsNullOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return x.isnull()
-
-
-class LenOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).length().cast(ibis_dtypes.int64)
-
-
-class NotNullOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return x.notnull()
-
-
-class HashOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.IntegerValue, x).hash()
-
-
-## String Operation
-class ReverseOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).reverse()
-
-
-class LowerOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).lower()
-
-
-class UpperOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).upper()
-
-
-class StripOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).strip()
-
-
-class IsNumericOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        # catches all members of the Unicode number class, which matches pandas isnumeric
-        # see https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#regexp_contains
-        # TODO: Validate correctness, my miss eg ⅕ character
-        return typing.cast(ibis_types.StringValue, x).re_search(r"^(\pN+)$")
-
-
-class IsAlphaOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).re_search(
-            r"^(\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$"
-        )
-
-
-class IsDigitOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        # Based on docs, should include superscript/subscript-ed numbers
-        # Tests however pass only when set to Nd unicode class
-        return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$")
-
-
-class IsDecimalOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$")
-
-
-class IsAlnumOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).re_search(
-            r"^(\p{N}|\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$"
-        )
-
-
-class IsSpaceOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        # All characters are whitespace characters, False for empty string
-        return typing.cast(ibis_types.StringValue, x).re_search(r"^\s+$")
-
-
-class IsLowerOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        # No upper case characters, min one cased character
-        # See: https://docs.python.org/3/library/stdtypes.html#str
-        return typing.cast(ibis_types.StringValue, x).re_search(
-            r"\p{Ll}"
-        ) & ~typing.cast(ibis_types.StringValue, x).re_search(r"\p{Lu}|\p{Lt}")
-
-
-class IsUpperOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        # No lower case characters, min one cased character
-        # See: https://docs.python.org/3/library/stdtypes.html#str
-        return typing.cast(ibis_types.StringValue, x).re_search(
-            r"\p{Lu}"
-        ) & ~typing.cast(ibis_types.StringValue, x).re_search(r"\p{Ll}|\p{Lt}")
-
-
-class RstripOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).rstrip()
-
-
-class LstripOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).lstrip()
+    def name(self) -> str:
+        raise NotImplementedError("RowOp abstract base class has no implementation")

+    @property
+    def arguments(self) -> int:
+        return 1

-class CapitalizeOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).capitalize()

+@dataclasses.dataclass(frozen=True)
+class BinaryOp:
+    @property
+    def name(self) -> str:
+        raise NotImplementedError("RowOp abstract base class has no implementation")

-class ContainsStringOp(UnaryOp):
-    def __init__(self, pat: str, case: bool = True):
-        self._pat = pat
+    @property
+    def arguments(self) -> int:
+        return 2

-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).contains(self._pat)

+@dataclasses.dataclass(frozen=True)
+class TernaryOp:
+    @property
+    def name(self) -> str:
+        raise NotImplementedError("RowOp abstract base class has no implementation")

-class ContainsRegexOp(UnaryOp):
-    def __init__(self, pat: str):
-        self._pat = pat
+    @property
+    def arguments(self) -> int:
+        return 3
+
+
+# Operation Factories
+def create_unary_op(name: str) -> UnaryOp:
+    return dataclasses.make_dataclass(
+        name,
+        [("name", typing.ClassVar[str], name)],  # type: ignore
+        bases=(UnaryOp,),
+        frozen=True,
+    )()
+
+
+def create_binary_op(name: str) -> BinaryOp:
+    return dataclasses.make_dataclass(
+        name,
+        [("name", typing.ClassVar[str], name)],  # type: ignore
+        bases=(BinaryOp,),
+        frozen=True,
+    )()
+
+
+def create_ternary_op(name: str) -> TernaryOp:
+    return dataclasses.make_dataclass(
+        name,
+        [("name", typing.ClassVar[str], name)],  # type: ignore
+        bases=(TernaryOp,),
+        frozen=True,
+    )()
+
+
+# Unary Ops
+## Generic Ops
+invert_op = create_unary_op(name="invert")
+isnull_op = create_unary_op(name="isnull")
+notnull_op = create_unary_op(name="notnull")
+hash_op = create_unary_op(name="hash")
+## String Ops
+len_op = create_unary_op(name="len")
+reverse_op = create_unary_op(name="reverse")
+lower_op = create_unary_op(name="lower")
+upper_op = create_unary_op(name="upper")
+strip_op = create_unary_op(name="strip")
+isalnum_op = create_unary_op(name="isalnum")
+isalpha_op = create_unary_op(name="isalpha")
+isdecimal_op = create_unary_op(name="isdecimal")
+isdigit_op = create_unary_op(name="isdigit")
+isnumeric_op = create_unary_op(name="isnumeric")
+isspace_op = create_unary_op(name="isspace")
+islower_op = create_unary_op(name="islower")
+isupper_op = create_unary_op(name="isupper")
+rstrip_op = create_unary_op(name="rstrip")
+lstrip_op = create_unary_op(name="lstrip")
+capitalize_op = create_unary_op(name="capitalize")
+## DateTime Ops
+day_op = create_unary_op(name="day")
+dayofweek_op = create_unary_op(name="dayofweek")
+date_op = create_unary_op(name="date")
+hour_op = create_unary_op(name="hour")
+minute_op = create_unary_op(name="minute")
+month_op = create_unary_op(name="month")
+quarter_op = create_unary_op(name="quarter")
+second_op = create_unary_op(name="second")
+time_op = create_unary_op(name="time")
+year_op = create_unary_op(name="year")
+## Trigonometry Ops
+sin_op = create_unary_op(name="sin")
+cos_op = create_unary_op(name="cos")
+tan_op = create_unary_op(name="tan")
+arcsin_op = create_unary_op(name="arcsin")
+arccos_op = create_unary_op(name="arccos")
+arctan_op = create_unary_op(name="arctan")
+sinh_op = create_unary_op(name="sinh")
+cosh_op = create_unary_op(name="cosh")
+tanh_op = create_unary_op(name="tanh")
+arcsinh_op = create_unary_op(name="arcsinh")
+arccosh_op = create_unary_op(name="arccosh")
+arctanh_op = create_unary_op(name="arctanh")
+## Numeric Ops
+abs_op = create_unary_op(name="abs")
+exp_op = create_unary_op(name="exp")
+ln_op = create_unary_op(name="log")
+log10_op = create_unary_op(name="log10")
+sqrt_op = create_unary_op(name="sqrt")
+
+
+# Parameterized unary ops
+@dataclasses.dataclass(frozen=True)
+class StrContainsOp(UnaryOp):
+    name: typing.ClassVar[str] = "str_contains"
+    pat: str
+
+
+@dataclasses.dataclass(frozen=True)
+class StrContainsRegexOp(UnaryOp):
+    name: typing.ClassVar[str] = "str_contains_regex"
+    pat: str
+
+
+@dataclasses.dataclass(frozen=True)
+class StrGetOp(UnaryOp):
+    name: typing.ClassVar[str] = "str_get"
+    i: int

-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).re_search(self._pat)

+@dataclasses.dataclass(frozen=True)
+class StrPadOp(UnaryOp):
+    name: typing.ClassVar[str] = "str_pad"
+    length: int
+    fillchar: str
+    side: typing.Literal["both", "left", "right"]

-class StrGetOp(UnaryOp):
-    def __init__(self, i: int):
-        self._i = i

-    def _as_ibis(self, x: ibis_types.Value):
-        substr = typing.cast(
-            ibis_types.StringValue, typing.cast(ibis_types.StringValue, x)[self._i]
-        )
-        return substr.nullif(ibis_types.literal(""))
+@dataclasses.dataclass(frozen=True)
+class ReplaceStrOp(UnaryOp):
+    name: typing.ClassVar[str] = "str_replace"
+    pat: str
+    repl: str


-class StrPadOp(UnaryOp):
-    def __init__(
-        self, length: int, fillchar: str, side: typing.Literal["both", "left", "right"]
-    ):
-        self._length = length
-        self._fillchar = fillchar
-        self._side = side
-
-    def _as_ibis(self, x: ibis_types.Value):
-        str_val = typing.cast(ibis_types.StringValue, x)
-
-        # SQL pad operations will truncate, we do not want to truncate though.
-        pad_length = typing.cast(
-            ibis_types.IntegerValue, ibis.greatest(str_val.length(), self._length)
-        )
-        if self._side == "left":
-            return str_val.lpad(pad_length, self._fillchar)
-        elif self._side == "right":
-            return str_val.rpad(pad_length, self._fillchar)
-        else:  # side == both
-            # Pad more on right side if can't pad both sides equally
-            lpad_amount = typing.cast(
-                ibis_types.IntegerValue,
-                (
-                    (pad_length - str_val.length())
-                    // typing.cast(ibis_types.NumericValue, ibis.literal(2))
-                )
-                + str_val.length(),
-            )
-            return str_val.lpad(lpad_amount, self._fillchar).rpad(
-                pad_length, self._fillchar
-            )
-
-
-class ReplaceStringOp(UnaryOp):
-    def __init__(self, pat: str, repl: str):
-        self._pat = pat
-        self._repl = repl
-
-    def _as_ibis(self, x: ibis_types.Value):
-        pat_str_value = typing.cast(
-            ibis_types.StringValue, ibis_types.literal(self._pat)
-        )
-        repl_str_value = typing.cast(
-            ibis_types.StringValue, ibis_types.literal(self._repl)
-        )
-
-        return typing.cast(ibis_types.StringValue, x).replace(
-            pat_str_value, repl_str_value
-        )
-
-
-class ReplaceRegexOp(UnaryOp):
-    def __init__(self, pat: str, repl: str):
-        self._pat = pat
-        self._repl = repl
-
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).re_replace(self._pat, self._repl)
+@dataclasses.dataclass(frozen=True)
+class RegexReplaceStrOp(UnaryOp):
+    name: typing.ClassVar[str] = "str_rereplace"
+    pat: str
+    repl: str


+@dataclasses.dataclass(frozen=True)
 class StartsWithOp(UnaryOp):
-    def __init__(self, pat: typing.Sequence[str]):
-        self._pat = pat
-
-    def _as_ibis(self, x: ibis_types.Value):
-        any_match = None
-        for pat in self._pat:
-            pat_match = typing.cast(ibis_types.StringValue, x).startswith(pat)
-            if any_match is not None:
-                any_match = any_match | pat_match
-            else:
-                any_match = pat_match
-        return any_match if any_match is not None else ibis_types.literal(False)
+    name: typing.ClassVar[str] = "str_startswith"
+    pat: typing.Sequence[str]


+@dataclasses.dataclass(frozen=True)
 class EndsWithOp(UnaryOp):
-    def __init__(self, pat: typing.Sequence[str]):
-        self._pat = pat
-
-    def _as_ibis(self, x: ibis_types.Value):
-        any_match = None
-        for pat in self._pat:
-            pat_match = typing.cast(ibis_types.StringValue, x).endswith(pat)
-            if any_match is not None:
-                any_match = any_match | pat_match
-            else:
-                any_match = pat_match
-        return any_match if any_match is not None else ibis_types.literal(False)
+    name: typing.ClassVar[str] = "str_endswith"
+    pat: typing.Sequence[str]


+@dataclasses.dataclass(frozen=True)
 class ZfillOp(UnaryOp):
-    def __init__(self, width: int):
-        self._width = width
-
-    def _as_ibis(self, x: ibis_types.Value):
-        str_value = typing.cast(ibis_types.StringValue, x)
-        return (
-            ibis.case()
-            .when(
-                str_value[0] == "-",
-                "-"
-                + StrPadOp(self._width - 1, "0", "left")._as_ibis(str_value.substr(1)),
-            )
-            .else_(StrPadOp(self._width, "0", "left")._as_ibis(str_value))
-            .end()
-        )
-
-
-## Datetime Ops
-class DayOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.TimestampValue, x).day().cast(ibis_dtypes.int64)
-
-
-class DateOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.TimestampValue, x).date()
-
-
-class DayofweekOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return (
-            typing.cast(ibis_types.TimestampValue, x)
-            .day_of_week.index()
-            .cast(ibis_dtypes.int64)
-        )
+    name: typing.ClassVar[str] = "str_zfill"
+    width: int


-class HourOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.TimestampValue, x).hour().cast(ibis_dtypes.int64)
+@dataclasses.dataclass(frozen=True)
+class StrFindOp(UnaryOp):
+    name: typing.ClassVar[str] = "str_find"
+    substr: str
+    start: typing.Optional[int]
+    end: typing.Optional[int]


-class MinuteOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return (
-            typing.cast(ibis_types.TimestampValue, x).minute().cast(ibis_dtypes.int64)
-        )
+@dataclasses.dataclass(frozen=True)
+class StrExtractOp(UnaryOp):
+    name: typing.ClassVar[str] = "str_extract"
+    pat: str
+    n: int = 1


-class MonthOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.TimestampValue, x).month().cast(ibis_dtypes.int64)
+@dataclasses.dataclass(frozen=True)
+class StrSliceOp(UnaryOp):
+    name: typing.ClassVar[str] = "str_slice"
+    start: typing.Optional[int]
+    end: typing.Optional[int]


-class QuarterOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return (
-            typing.cast(ibis_types.TimestampValue, x).quarter().cast(ibis_dtypes.int64)
-        )
+@dataclasses.dataclass(frozen=True)
+class StrRepeatOp(UnaryOp):
+    name: typing.ClassVar[str] = "str_repeat"
+    repeats: int


-class SecondOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return (
-            typing.cast(ibis_types.TimestampValue, x).second().cast(ibis_dtypes.int64)
-        )
+# Other parameterized unary operations
+@dataclasses.dataclass(frozen=True)
+class StructFieldOp(UnaryOp):
+    name: typing.ClassVar[str] = "struct_field"
+    name_or_index: str | int


-class TimeOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.TimestampValue, x).time()
-
-
-class YearOp(UnaryOp):
-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.TimestampValue, x).year().cast(ibis_dtypes.int64)
-
-
-# Parameterized ops
+@dataclasses.dataclass(frozen=True)
 class AsTypeOp(UnaryOp):
-    def __init__(self, to_type: dtypes.DtypeString | dtypes.Dtype):
-        self.to_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype(to_type)
+    name: typing.ClassVar[str] = "astype"
+    to_type: dtypes.DtypeString | dtypes.Dtype

-    def _as_ibis(self, x: ibis_types.Value):
-        if isinstance(x, ibis_types.NullScalar):
-            return ibis_types.null().cast(self.to_type)
-        return bigframes.dtypes.cast_ibis_value(x, self.to_type)
-
-class MapOp(UnaryOp):
-    def __init__(
-        self,
-        mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...],
-    ):
-        self._mappings = mappings
+@dataclasses.dataclass(frozen=True)
+class IsInOp(UnaryOp):
+    name: typing.ClassVar[str] = "is_in"
+    values: typing.Tuple
+    match_nulls: bool = True

-    def _as_ibis(self, x: ibis_types.Value):
-        case = ibis.case()
-        for mapping in self._mappings:
-            case = case.when(x == mapping[0], mapping[1])
-        return case.else_(x).end()

+@dataclasses.dataclass(frozen=True)
+class RemoteFunctionOp(UnaryOp):
+    name: typing.ClassVar[str] = "remote_function"
+    func: typing.Callable
+    apply_on_null: bool

-class FindOp(UnaryOp):
-    def __init__(self, sub, start, end):
-        self._sub = sub
-        self._start = start
-        self._end = end

-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).find(
-            self._sub, self._start, self._end
-        )
+@dataclasses.dataclass(frozen=True)
+class MapOp(UnaryOp):
+    name = "map_values"
+    mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...]


-class ExtractOp(UnaryOp):
-    def __init__(self, pat: str, n: int = 1):
-        self._pat = pat
-        self._n = n
+# Operation Composition
+# Meta-ops that do partial application or parameter remapping
+# Subject to change, may convert to explicit tree
+@dataclasses.dataclass(frozen=True)
+class ApplyRight(UnaryOp):
+    name: typing.ClassVar[str] = "apply_right"
+    base_op: BinaryOp
+    right_scalar: typing.Any

-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).re_extract(self._pat, self._n)

+@dataclasses.dataclass(frozen=True)
+class ApplyLeft(UnaryOp):
+    name: typing.ClassVar[str] = "apply_left"
+    base_op: BinaryOp
+    left_scalar: typing.Any

-class SliceOp(UnaryOp):
-    def __init__(self, start, stop):
-        self._start = start
-        self._stop = stop

-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x)[self._start : self._stop]
+@dataclasses.dataclass(frozen=True)
+class ApplyArg1(BinaryOp):
+    name: typing.ClassVar[str] = "apply_arg1"
+    base_op: TernaryOp
+    scalar: typing.Any


-class IsInOp(UnaryOp):
-    def __init__(self, values, match_nulls: bool = True):
-        self._values = values
-        self._match_nulls = match_nulls
+@dataclasses.dataclass(frozen=True)
+class ApplyArg3(BinaryOp):
+    name: typing.ClassVar[str] = "apply_arg3"
+    base_op: TernaryOp
+    scalar: typing.Any

-    def _as_ibis(self, x: ibis_types.Value):
-        contains_nulls = any(is_null(value) for value in self._values)
-        matchable_ibis_values = []
-        for item in self._values:
-            if not is_null(item):
-                try:
-                    # we want values that *could* be cast to the dtype, but we don't want
-                    # to actually cast it, as that could be lossy (eg float -> int)
-                    item_inferred_type = ibis.literal(item).type()
-                    if (
-                        x.type() == item_inferred_type
-                        or x.type().is_numeric()
-                        and item_inferred_type.is_numeric()
-                    ):
-                        matchable_ibis_values.append(item)
-                except TypeError:
-                    pass

-        if self._match_nulls and contains_nulls:
-            return x.isnull() | x.isin(matchable_ibis_values)
-        else:
-            return x.isin(matchable_ibis_values)
+@dataclasses.dataclass(frozen=True)
+class ReverseArgsOp(BinaryOp):
+    name: typing.ClassVar[str] = "apply_reverse"
+    base_op: BinaryOp


-class BinopPartialRight(UnaryOp):
-    def __init__(self, binop: BinaryOp, right_scalar: typing.Any):
-        self._binop = binop
-        self._right = dtypes.literal_to_ibis_scalar(right_scalar, validate=False)
+def partial_left(op: BinaryOp, scalar: typing.Any) -> UnaryOp:
+    return ApplyLeft(base_op=op, left_scalar=scalar)

-    def _as_ibis(self, x):
-        return self._binop(x, self._right)

+def partial_right(op: BinaryOp, scalar: typing.Any) -> UnaryOp:
+    return ApplyRight(base_op=op, right_scalar=scalar)

-class BinopPartialLeft(UnaryOp):
-    def __init__(self, binop: BinaryOp, left_scalar: typing.Any):
-        self._binop = binop
-        self._left = dtypes.literal_to_ibis_scalar(left_scalar, validate=False)

-    def _as_ibis(self, x):
-        return self._binop(self._left, x)
+def partial_arg1(op: TernaryOp, scalar: typing.Any) -> BinaryOp:
+    return ApplyArg1(base_op=op, scalar=scalar)


-class RepeatOp(UnaryOp):
-    def __init__(self, repeats):
-        self._repeats = repeats
+def partial_arg3(op: TernaryOp, scalar: typing.Any) -> BinaryOp:
+    return ApplyArg3(base_op=op, scalar=scalar)

-    def _as_ibis(self, x: ibis_types.Value):
-        return typing.cast(ibis_types.StringValue, x).repeat(self._repeats)

+def reverse(op: BinaryOp) -> BinaryOp:
+    return ReverseArgsOp(base_op=op)
+
+
+# Binary Ops
+fillna_op = create_binary_op(name="fillna")
+cliplower_op = create_binary_op(name="clip_lower")
+clipupper_op = create_binary_op(name="clip_upper")
+coalesce_op = create_binary_op(name="coalesce")
+## Math Ops
+add_op = create_binary_op(name="add")
+sub_op = create_binary_op(name="sub")
+mul_op = create_binary_op(name="mul")
+div_op = create_binary_op(name="div")
+floordiv_op = create_binary_op(name="floordiv")
+pow_op = create_binary_op(name="pow")
+mod_op = create_binary_op(name="mod")
+round_op = create_binary_op(name="round")
+unsafe_pow_op = create_binary_op(name="unsafe_pow_op")
+# Logical Ops
+and_op = create_binary_op(name="and")
+or_op = create_binary_op(name="or")
+
+## Comparison Ops
+eq_op = create_binary_op(name="eq")
+eq_null_match_op = create_binary_op(name="eq_nulls_match")
+ne_op = create_binary_op(name="ne")
+lt_op = create_binary_op(name="lt")
+gt_op = create_binary_op(name="gt")
+le_op = create_binary_op(name="le")
+ge_op = create_binary_op(name="ge")
+
+## String Ops
+strconcat_op = create_binary_op(name="strconcat")
+
+# Ternary Ops
+where_op = create_ternary_op(name="where")
+clip_op = create_ternary_op(name="clip")

-class RemoteFunctionOp(UnaryOp):
-    def __init__(self, func: typing.Callable, apply_on_null=True):
-        if not hasattr(func, "bigframes_remote_function"):
-            raise TypeError(
-                f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}"
-            )
-
-        self._func = func
-        self._apply_on_null = apply_on_null
-
-    def _as_ibis(self, x: ibis_types.Value):
-        x_transformed = self._func(x)
-        if not self._apply_on_null:
-            x_transformed = where_op(x, x.isnull(), x_transformed)
-        return x_transformed
-
-
-abs_op = AbsOp()
-invert_op = InvertOp()
-isnull_op = IsNullOp()
-len_op = LenOp()
-notnull_op = NotNullOp()
-reverse_op = ReverseOp()
-lower_op = LowerOp()
-upper_op = UpperOp()
-strip_op = StripOp()
-isalnum_op = IsAlnumOp()
-isalpha_op = IsAlphaOp()
-isdecimal_op = IsDecimalOp()
-isdigit_op = IsDigitOp()
-isnumeric_op = IsNumericOp()
-isspace_op = IsSpaceOp()
-islower_op = IsLowerOp()
-isupper_op = IsUpperOp()
-rstrip_op = RstripOp()
-lstrip_op = LstripOp()
-hash_op = HashOp()
-day_op = DayOp()
-dayofweek_op = DayofweekOp()
-date_op = DateOp()
-hour_op = HourOp()
-minute_op = MinuteOp()
-month_op = MonthOp()
-quarter_op = QuarterOp()
-second_op = SecondOp()
-time_op = TimeOp()
-year_op = YearOp()
-capitalize_op = CapitalizeOp()

 # Just parameterless unary ops for now
 # TODO: Parameter mappings
 NUMPY_TO_OP: typing.Final = {
-    np.sin: SinOp(),
-    np.cos: CosOp(),
-    np.tan: TanOp(),
-    np.arcsin: ArcsinOp(),
-    np.arccos: ArccosOp(),
-    np.arctan: ArctanOp(),
-    np.sinh: SinhOp(),
-    np.cosh: CoshOp(),
-    np.tanh: TanhOp(),
-    np.arcsinh: ArcsinhOp(),
-    np.arccosh: ArccoshOp(),
-    np.arctanh: ArctanhOp(),
-    np.exp: ExpOp(),
-    np.log: LnOp(),
-    np.log10: Log10Op(),
-    np.sqrt: SqrtOp(),
-    np.abs: AbsOp(),
+    np.sin: sin_op,
+    np.cos: cos_op,
+    np.tan: tan_op,
+    np.arcsin: arcsin_op,
+    np.arccos: arccos_op,
+    np.arctan: arctan_op,
+    np.sinh: sinh_op,
+    np.cosh: cosh_op,
+    np.tanh: tanh_op,
+    np.arcsinh: arcsinh_op,
+    np.arccosh: arccosh_op,
+    np.arctanh: arctanh_op,
+    np.exp: exp_op,
+    np.log: ln_op,
+    np.log10: log10_op,
+    np.sqrt: sqrt_op,
+    np.abs: abs_op,
 }


-### Binary Ops
-def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
-    """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
-
-    def short_circuit_nulls_inner(binop):
-        @functools.wraps(binop)
-        def wrapped_binop(x: ibis_types.Value, y: ibis_types.Value):
-            if isinstance(x, ibis_types.NullScalar):
-                return ibis_types.null().cast(type_override or y.type())
-            elif isinstance(y, ibis_types.NullScalar):
-                return ibis_types.null().cast(type_override or x.type())
-            else:
-                return binop(x, y)
-
-        return wrapped_binop
-
-    return short_circuit_nulls_inner
-
-
-def concat_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    x_string = typing.cast(ibis_types.StringValue, x)
-    y_string = typing.cast(ibis_types.StringValue, y)
-    return x_string.concat(y_string)
-
-
-def eq_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    return x == y
-
-
-def eq_nulls_match_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    """Variant of eq_op where nulls match each other. Only use where dtypes are known to be same."""
-    left = x.cast(ibis_dtypes.str).fillna(ibis_types.literal("$NULL_SENTINEL$"))
-    right = y.cast(ibis_dtypes.str).fillna(ibis_types.literal("$NULL_SENTINEL$"))
-    return left == right
-
-
-def ne_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    return x != y
-
-
-def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue):
-    return ibis.where(
-        where_value,
-        value,
-        ibis.null(),
-    )
-
-
-def and_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by
-    # implementing three-valued logic ourselves. For AND, when we encounter a
-    # NULL value, we only know when the result is FALSE, otherwise the result
-    # is unknown (NULL). See: truth table at
-    # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR
-    if isinstance(x, ibis_types.NullScalar):
-        return _null_or_value(y, y == ibis.literal(False))
-
-    if isinstance(y, ibis_types.NullScalar):
-        return _null_or_value(x, x == ibis.literal(False))
-
-    return typing.cast(ibis_types.BooleanValue, x) & typing.cast(
-        ibis_types.BooleanValue, y
-    )
-
-
-def or_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by
-    # implementing three-valued logic ourselves. For OR, when we encounter a
-    # NULL value, we only know when the result is TRUE, otherwise the result
-    # is unknown (NULL). See: truth table at
-    # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR
-    if isinstance(x, ibis_types.NullScalar):
-        return _null_or_value(y, y == ibis.literal(True))
-
-    if isinstance(y, ibis_types.NullScalar):
-        return _null_or_value(x, x == ibis.literal(True))
-
-    return typing.cast(ibis_types.BooleanValue, x) | typing.cast(
-        ibis_types.BooleanValue, y
-    )
-
-
-@short_circuit_nulls()
-def add_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar):
-        return ibis.null()
-    try:
-        # Could be string concatenation or numeric addition.
-        return x + y  # type: ignore
-    except ibis.common.annotations.SignatureValidationError as exc:
-        left_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(x.type())
-        right_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(y.type())
-        raise TypeError(
-            f"Cannot add {repr(left_type)} and {repr(right_type)}. {constants.FEEDBACK_LINK}"
{constants.FEEDBACK_LINK}" - ) from exc - - -@short_circuit_nulls() -def sub_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return typing.cast(ibis_types.NumericValue, x) - typing.cast( - ibis_types.NumericValue, y - ) - - -@short_circuit_nulls() -def mul_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return typing.cast(ibis_types.NumericValue, x) * typing.cast( - ibis_types.NumericValue, y - ) - - -@short_circuit_nulls(ibis_dtypes.float) -def div_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return typing.cast(ibis_types.NumericValue, x) / typing.cast( - ibis_types.NumericValue, y - ) - - -@short_circuit_nulls(ibis_dtypes.float) -def pow_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - if x.type().is_integer() and y.type().is_integer(): - return _int_pow_op(x, y) - else: - return _float_pow_op(x, y) - - -@short_circuit_nulls(ibis_dtypes.float) -def unsafe_pow_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - """For internal use only - where domain and overflow checks are not needed.""" - return typing.cast(ibis_types.NumericValue, x) ** typing.cast( - ibis_types.NumericValue, y - ) - - -def _int_pow_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - # Need to avoid any error cases - should produce NaN instead - # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow - x_as_decimal = typing.cast( - ibis_types.NumericValue, - x.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)), - ) - y_val = typing.cast(ibis_types.NumericValue, y) - - # BQ POW() function outputs FLOAT64, which can lose precision. - # Therefore, we do math in NUMERIC and cast back down after. - # Also, explicit bounds checks, pandas will silently overflow. - pow_result = x_as_decimal**y_val - overflow_cond = (pow_result > _ibis_num((2**63) - 1)) | ( - pow_result < _ibis_num(-(2**63)) - ) - - return ( - ibis.case() - .when((overflow_cond), ibis.null()) - .else_(pow_result.cast(ibis_dtypes.int64)) - .end() - ) - - -def _float_pow_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - # Most conditions here seek to prevent calling BQ POW with inputs that would generate errors. 
-    # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow
-    x_val = typing.cast(ibis_types.NumericValue, x)
-    y_val = typing.cast(ibis_types.NumericValue, y)
-
-    overflow_cond = (x_val != _ZERO) & ((y_val * x_val.abs().ln()) > _FLOAT64_EXP_BOUND)
-
-    # Float64 lose integer precision beyond 2**53, beyond this insufficient precision to get parity
-    exp_too_big = y_val.abs() > _ibis_num(2**53)
-    # Treat very large exponents as +=INF
-    norm_exp = exp_too_big.ifelse(_INF * y_val.sign(), y_val)
-
-    pow_result = x_val**norm_exp
-
-    # This cast is dangerous, need to only excuted where y_val has been bounds-checked
-    # Ibis needs try_cast binding to bq safe_cast
-    exponent_is_whole = y_val.cast(ibis_dtypes.int64) == y_val
-    odd_exponent = (x_val < _ZERO) & (
-        y_val.cast(ibis_dtypes.int64) % _ibis_num(2) == _ibis_num(1)
-    )
-    infinite_base = x_val.abs() == _INF
-
-    return (
-        ibis.case()
-        # Might be able to do something more clever with x_val==0 case
-        .when(y_val == _ZERO, _ibis_num(1))
-        .when(
-            x_val == _ibis_num(1), _ibis_num(1)
-        )  # Need to ignore exponent, even if it is NA
-        .when(
-            (x_val == _ZERO) & (y_val < _ZERO), _INF
-        )  # This case would error POW function in BQ
-        .when(infinite_base, pow_result)
-        .when(
-            exp_too_big, pow_result
-        )  # Bigquery can actually handle the +-inf cases gracefully
-        .when((x_val < _ZERO) & (~exponent_is_whole), _NAN)
-        .when(
-            overflow_cond, _INF * odd_exponent.ifelse(_ibis_num(-1), _ibis_num(1))
-        )  # finite overflows would cause bq to error
-        .else_(pow_result)
-        .end()
-    )
-
-
-@short_circuit_nulls(ibis_dtypes.bool)
-def lt_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    return x < y
-
-
-@short_circuit_nulls(ibis_dtypes.bool)
-def le_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    return x <= y
-
-
-@short_circuit_nulls(ibis_dtypes.bool)
-def gt_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    return x > y
-
-
-@short_circuit_nulls(ibis_dtypes.bool)
-def ge_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    return x >= y
-
-
-def coalesce_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    if x.name("name").equals(y.name("name")):
-        return x
-    else:
-        return ibis.coalesce(x, y)
-
-
-@short_circuit_nulls(ibis_dtypes.int)
-def floordiv_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    x_numeric = typing.cast(ibis_types.NumericValue, x)
-    y_numeric = typing.cast(ibis_types.NumericValue, y)
-    floordiv_expr = x_numeric // y_numeric
-
-    # DIV(N, 0) will error in bigquery, but needs to return 0 for int, and inf for float in BQ so we short-circuit in this case.
-    # Multiplying left by zero propogates nulls.
-    zero_result = _INF if (x.type().is_floating() or y.type().is_floating()) else _ZERO
-    return (
-        ibis.case()
-        .when(y_numeric == _ZERO, zero_result * x_numeric)
-        .else_(floordiv_expr)
-        .end()
-    )
-
-
-def _is_float(x: ibis_types.Value):
-    return isinstance(x, (ibis_types.FloatingColumn, ibis_types.FloatingScalar))
-
-
-@short_circuit_nulls()
-def mod_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    is_result_float = _is_float(x) | _is_float(y)
-    x_numeric = typing.cast(
-        ibis_types.NumericValue,
-        x.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True))
-        if is_result_float
-        else x,
-    )
-    y_numeric = typing.cast(
-        ibis_types.NumericValue,
-        y.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True))
-        if is_result_float
-        else y,
-    )
-    # Hacky short-circuit to avoid passing zero-literal to sql backend, evaluate locally instead to null.
-    op = y.op()
-    if isinstance(op, ibis.expr.operations.generic.Literal) and op.value == 0:
-        return ibis_types.null().cast(x.type())
-
-    bq_mod = x_numeric % y_numeric  # Bigquery will maintain x sign here
-    if is_result_float:
-        bq_mod = typing.cast(ibis_types.NumericValue, bq_mod.cast(ibis_dtypes.float64))
-
-    # In BigQuery returned value has the same sign as X. In pandas, the sign of y is used, so we need to flip the result if sign(x) != sign(y)
-    return (
-        ibis.case()
-        .when(
-            y_numeric == _ZERO,
-            _NAN * x_numeric if is_result_float else _ZERO * x_numeric,
-        )  # Dummy op to propogate nulls and type from x arg
-        .when(
-            (y_numeric < _ZERO) & (bq_mod > _ZERO), (y_numeric + bq_mod)
-        )  # Convert positive result to negative
-        .when(
-            (y_numeric > _ZERO) & (bq_mod < _ZERO), (y_numeric + bq_mod)
-        )  # Convert negative result to positive
-        .else_(bq_mod)
-        .end()
-    )
-
-
-def fillna_op(
-    x: ibis_types.Value,
-    y: ibis_types.Value,
-):
-    return x.fillna(typing.cast(ibis_types.Scalar, y))
-
-
-def round_op(x: ibis_types.Value, y: ibis_types.Value):
-    return typing.cast(ibis_types.NumericValue, x).round(
-        digits=typing.cast(ibis_types.IntegerValue, y)
-    )
-
-
-def clip_lower(
-    value: ibis_types.Value,
-    lower: ibis_types.Value,
-):
-    return ibis.case().when(lower.isnull() | (value < lower), lower).else_(value).end()
-
-
-def clip_upper(
-    value: ibis_types.Value,
-    upper: ibis_types.Value,
-):
-    return ibis.case().when(upper.isnull() | (value > upper), upper).else_(value).end()
-
-
-def reverse(op: BinaryOp) -> BinaryOp:
-    return lambda x, y: op(y, x)
-
-
-def partial_left(op: BinaryOp, scalar: typing.Any) -> UnaryOp:
-    return BinopPartialLeft(op, scalar)
-
-
-def partial_right(op: BinaryOp, scalar: typing.Any) -> UnaryOp:
-    return BinopPartialRight(op, scalar)
-
-
 NUMPY_TO_BINOP: typing.Final = {
     np.add: add_op,
     np.subtract: sub_op,
@@ -1098,75 +394,3 @@ def partial_right(op: BinaryOp, scalar: typing.Any) -> UnaryOp:
     np.divide: div_op,
     np.power: pow_op,
 }
-
-
-# Ternary ops
-def where_op(
-    original: ibis_types.Value,
-    condition: ibis_types.Value,
-    replacement: ibis_types.Value,
-) -> ibis_types.Value:
-    """Returns x if y is true, otherwise returns z."""
-    return ibis.case().when(condition, original).else_(replacement).end()  # type: ignore
-
-
-def clip_op(
-    original: ibis_types.Value,
-    lower: ibis_types.Value,
-    upper: ibis_types.Value,
-) -> ibis_types.Value:
-    """Clips value to lower and upper bounds."""
-    if isinstance(lower, ibis_types.NullScalar) and (
-        not isinstance(upper, ibis_types.NullScalar)
-    ):
-        return (
-            ibis.case()  # type: ignore
-            .when(upper.isnull() | (original > upper), upper)
-            .else_(original)
.end() - ) - elif (not isinstance(lower, ibis_types.NullScalar)) and isinstance( - upper, ibis_types.NullScalar - ): - return ( - ibis.case() # type: ignore - .when(lower.isnull() | (original < lower), lower) - .else_(original) - .end() - ) - elif isinstance(lower, ibis_types.NullScalar) and ( - isinstance(upper, ibis_types.NullScalar) - ): - return original - else: - # Note: Pandas has unchanged behavior when upper bound and lower bound - # are flipped. - # This implementation requires that lower_bound < upper_bound. - return ( - ibis.case() # type: ignore - .when(lower.isnull() | (original < lower), lower) - .when(upper.isnull() | (original > upper), upper) - .else_(original) - .end() - ) - - -def partial_arg1(op: TernaryOp, scalar: typing.Any) -> BinaryOp: - return lambda x, y: op(dtypes.literal_to_ibis_scalar(scalar, validate=False), x, y) - - -def partial_arg2(op: TernaryOp, scalar: typing.Any) -> BinaryOp: - return lambda x, y: op(x, dtypes.literal_to_ibis_scalar(scalar, validate=False), y) - - -def partial_arg3(op: TernaryOp, scalar: typing.Any) -> BinaryOp: - return lambda x, y: op(x, y, dtypes.literal_to_ibis_scalar(scalar, validate=False)) - - -def is_null(value) -> bool: - # float NaN/inf should be treated as distinct from 'true' null values - return typing.cast(bool, pd.isna(value)) and not isinstance(value, float) - - -def _ibis_num(number: float): - return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 8178ebfaea..452abf047c 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -396,6 +396,11 @@ def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: ) +class LastOp(WindowOp): + def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: + return _apply_window_if_present(column.last(), window) + + class LastNonNullOp(WindowOp): @property def skips_nulls(self): diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 85ce1dd9e6..8989255f7e 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -71,7 +71,7 @@ def __init__( ) if dtype: block = block.multi_apply_unary_op( - block.value_columns, ops.AsTypeOp(dtype) + block.value_columns, ops.AsTypeOp(to_type=dtype) ) self._block = block @@ -162,7 +162,7 @@ def _apply_binary_op( block.select_column(result_id).assign_label(result_id, name) ) else: - partial_op = ops.BinopPartialRight(op, other) + partial_op = ops.ApplyRight(base_op=op, right_scalar=other) return self._apply_unary_op(partial_op) def _apply_corr_aggregation(self, other: series.Series) -> float: diff --git a/bigframes/operations/datetimes.py b/bigframes/operations/datetimes.py index a8a33beb57..3165e6f003 100644 --- a/bigframes/operations/datetimes.py +++ b/bigframes/operations/datetimes.py @@ -14,6 +14,9 @@ from __future__ import annotations +import datetime as dt +from typing import Optional + from bigframes.core import log_adapter import bigframes.operations as ops import bigframes.operations.base @@ -27,6 +30,7 @@ class DatetimeMethods( ): __doc__ = vendordt.DatetimeProperties.__doc__ + # Date accessors @property def day(self) -> series.Series: return self._apply_unary_op(ops.day_op) @@ -40,17 +44,26 @@ def date(self) -> series.Series: return self._apply_unary_op(ops.date_op) @property - def hour(self) -> series.Series: - return self._apply_unary_op(ops.hour_op) + def quarter(self) -> series.Series: + return 
self._apply_unary_op(ops.quarter_op) @property - def minute(self) -> series.Series: - return self._apply_unary_op(ops.minute_op) + def year(self) -> series.Series: + return self._apply_unary_op(ops.year_op) @property def month(self) -> series.Series: return self._apply_unary_op(ops.month_op) + # Time accessors + @property + def hour(self) -> series.Series: + return self._apply_unary_op(ops.hour_op) + + @property + def minute(self) -> series.Series: + return self._apply_unary_op(ops.minute_op) + @property def second(self) -> series.Series: return self._apply_unary_op(ops.second_op) @@ -60,9 +73,17 @@ def time(self) -> series.Series: return self._apply_unary_op(ops.time_op) @property - def quarter(self) -> series.Series: - return self._apply_unary_op(ops.quarter_op) + def tz(self) -> Optional[dt.timezone]: + # Assumption: pyarrow dtype + tz_string = self._dtype.pyarrow_dtype.tz + if tz_string == "UTC": + return dt.timezone.utc + elif tz_string is None: + return None + else: + raise ValueError(f"Unexpected timezone {tz_string}") @property - def year(self) -> series.Series: - return self._apply_unary_op(ops.year_op) + def unit(self) -> str: + # Assumption: pyarrow dtype + return self._dtype.pyarrow_dtype.unit diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 201b19abe8..2798f18b38 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -43,7 +43,7 @@ def find( start: Optional[int] = None, end: Optional[int] = None, ) -> series.Series: - return self._apply_unary_op(ops.FindOp(sub, start, end)) + return self._apply_unary_op(ops.StrFindOp(substr=sub, start=start, end=end)) def len(self) -> series.Series: return self._apply_unary_op(ops.len_op) @@ -61,7 +61,7 @@ def slice( start: Optional[int] = None, stop: Optional[int] = None, ) -> series.Series: - return self._apply_unary_op(ops.SliceOp(start, stop)) + return self._apply_unary_op(ops.StrSliceOp(start=start, end=stop)) def strip(self) -> series.Series: return self._apply_unary_op(ops.strip_op) @@ -114,7 +114,7 @@ def lstrip(self) -> series.Series: return self._apply_unary_op(ops.lstrip_op) def repeat(self, repeats: int) -> series.Series: - return self._apply_unary_op(ops.RepeatOp(repeats)) + return self._apply_unary_op(ops.StrRepeatOp(repeats=repeats)) def capitalize(self) -> series.Series: return self._apply_unary_op(ops.capitalize_op) @@ -122,38 +122,44 @@ def capitalize(self) -> series.Series: def match(self, pat, case=True, flags=0) -> series.Series: # \A anchors start of entire string rather than start of any line in multiline mode adj_pat = rf"\A{pat}" - return self.contains(adj_pat, case=case, flags=flags) + return self.contains(pat=adj_pat, case=case, flags=flags) def fullmatch(self, pat, case=True, flags=0) -> series.Series: # \A anchors start of entire string rather than start of any line in multiline mode # \z likewise anchors to the end of the entire multiline string adj_pat = rf"\A{pat}\z" - return self.contains(adj_pat, case=case, flags=flags) + return self.contains(pat=adj_pat, case=case, flags=flags) def get(self, i: int) -> series.Series: - return self._apply_unary_op(ops.StrGetOp(i)) + return self._apply_unary_op(ops.StrGetOp(i=i)) def pad(self, width, side="left", fillchar=" ") -> series.Series: - return self._apply_unary_op(ops.StrPadOp(width, fillchar, side)) + return self._apply_unary_op( + ops.StrPadOp(length=width, fillchar=fillchar, side=side) + ) def ljust(self, width, fillchar=" ") -> series.Series: - return self._apply_unary_op(ops.StrPadOp(width, 
fillchar, "right")) + return self._apply_unary_op( + ops.StrPadOp(length=width, fillchar=fillchar, side="right") + ) def rjust(self, width, fillchar=" ") -> series.Series: - return self._apply_unary_op(ops.StrPadOp(width, fillchar, "left")) + return self._apply_unary_op( + ops.StrPadOp(length=width, fillchar=fillchar, side="left") + ) def contains( self, pat, case: bool = True, flags: int = 0, *, regex: bool = True ) -> series.Series: if not case: - return self.contains(pat, flags=flags | re.IGNORECASE, regex=True) + return self.contains(pat=pat, flags=flags | re.IGNORECASE, regex=True) if regex: re2flags = _parse_flags(flags) if re2flags: pat = re2flags + pat - return self._apply_unary_op(ops.ContainsRegexOp(pat)) + return self._apply_unary_op(ops.StrContainsRegexOp(pat=pat)) else: - return self._apply_unary_op(ops.ContainsStringOp(pat)) + return self._apply_unary_op(ops.StrContainsOp(pat=pat)) def extract(self, pat: str, flags: int = 0) -> df.DataFrame: re2flags = _parse_flags(flags) @@ -173,7 +179,9 @@ def extract(self, pat: str, flags: int = 0) -> df.DataFrame: ] label = labels[0] if labels else str(i) block, id = block.apply_unary_op( - self._value_column, ops.ExtractOp(pat, i + 1), result_label=label + self._value_column, + ops.StrExtractOp(pat=pat, n=i + 1), + result_label=label, ) results.append(id) block = block.select_columns(results) @@ -196,13 +204,13 @@ def replace( re2flags = _parse_flags(flags) if re2flags: patstr = re2flags + patstr - return self._apply_unary_op(ops.ReplaceRegexOp(patstr, repl)) + return self._apply_unary_op(ops.RegexReplaceStrOp(pat=patstr, repl=repl)) else: if is_compiled: raise ValueError( "Must set 'regex'=True if using compiled regex pattern." ) - return self._apply_unary_op(ops.ReplaceStringOp(patstr, repl)) + return self._apply_unary_op(ops.ReplaceStrOp(pat=patstr, repl=repl)) def startswith( self, @@ -210,7 +218,7 @@ def startswith( ) -> series.Series: if not isinstance(pat, tuple): pat = (pat,) - return self._apply_unary_op(ops.StartsWithOp(pat)) + return self._apply_unary_op(ops.StartsWithOp(pat=pat)) def endswith( self, @@ -218,13 +226,15 @@ def endswith( ) -> series.Series: if not isinstance(pat, tuple): pat = (pat,) - return self._apply_unary_op(ops.EndsWithOp(pat)) + return self._apply_unary_op(ops.EndsWithOp(pat=pat)) def zfill(self, width: int) -> series.Series: - return self._apply_unary_op(ops.ZfillOp(width)) + return self._apply_unary_op(ops.ZfillOp(width=width)) def center(self, width: int, fillchar: str = " ") -> series.Series: - return self._apply_unary_op(ops.StrPadOp(width, fillchar, "both")) + return self._apply_unary_op( + ops.StrPadOp(length=width, fillchar=fillchar, side="both") + ) def cat( self, @@ -232,7 +242,7 @@ def cat( *, join: Literal["outer", "left"] = "left", ) -> series.Series: - return self._apply_binary_op(others, ops.concat_op, alignment=join) + return self._apply_binary_op(others, ops.strconcat_op, alignment=join) def _parse_flags(flags: int) -> Optional[str]: diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index b2ae98f378..0e00b781c9 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -14,10 +14,6 @@ from __future__ import annotations -import typing - -import ibis.expr.types as ibis_types - from bigframes.core import log_adapter import bigframes.dataframe import bigframes.operations @@ -26,19 +22,6 @@ import third_party.bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors -class _StructField(bigframes.operations.UnaryOp): - def 
__init__(self, name_or_index: str | int): - self._name_or_index = name_or_index - - def _as_ibis(self, x: ibis_types.Value): - struct_value = typing.cast(ibis_types.StructValue, x) - if isinstance(self._name_or_index, str): - name = self._name_or_index - else: - name = struct_value.names[self._name_or_index] - return struct_value[name].name(name) - - @log_adapter.class_logger class StructAccessor( bigframes.operations.base.SeriesMethods, vendoracessors.StructAccessor @@ -46,7 +29,7 @@ class StructAccessor( __doc__ = vendoracessors.StructAccessor.__doc__ def field(self, name_or_index: str | int) -> bigframes.series.Series: - series = self._apply_unary_op(_StructField(name_or_index)) + series = self._apply_unary_op(bigframes.operations.StructFieldOp(name_or_index)) if isinstance(name_or_index, str): name = name_or_index else: diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 7386c4a2e7..0f060a23e8 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -295,12 +295,12 @@ def _perform_get_dummies_block_operations( if column_label == "": new_column_label = value new_block, new_id = block.apply_unary_op( - column_id, ops.BinopPartialLeft(ops.eq_op, value) + column_id, ops.ApplyLeft(ops.eq_op, value) ) intermediate_col_ids.append(new_id) block, _ = new_block.apply_unary_op( new_id, - ops.BinopPartialRight(ops.fillna_op, False), + ops.ApplyRight(ops.fillna_op, False), result_label=new_column_label, ) if dummy_na: @@ -486,20 +486,22 @@ def read_gbq( query_or_table: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, filters: vendored_pandas_gbq.FiltersType = (), use_cache: bool = True, + col_order: Iterable[str] = (), ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query_or_table) return global_session.with_default_session( bigframes.session.Session.read_gbq, query_or_table, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, filters=filters, use_cache=use_cache, + col_order=col_order, ) @@ -520,18 +522,20 @@ def read_gbq_query( query: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, use_cache: bool = True, + col_order: Iterable[str] = (), ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query) return global_session.with_default_session( bigframes.session.Session.read_gbq_query, query, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, use_cache=use_cache, + col_order=col_order, ) @@ -542,18 +546,20 @@ def read_gbq_table( query: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, use_cache: bool = True, + col_order: Iterable[str] = (), ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query) return global_session.with_default_session( bigframes.session.Session.read_gbq_table, query, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, use_cache=use_cache, + col_order=col_order, ) diff --git a/bigframes/series.py b/bigframes/series.py index eefd2b755d..8f564423fc 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -139,6 +139,10 @@ def struct(self) -> structs.StructAccessor: def T(self) -> Series: return self.transpose() + @property + def _info_axis(self) -> 
indexes.Index: + return self.index + def transpose(self) -> Series: return self @@ -185,7 +189,7 @@ def rename( # Will throw if value type isn't compatible with index type. block, const_id = block.create_constant(v, dtype=idx_dtype) block, cond_id = block.apply_unary_op( - idx_id, ops.BinopPartialRight(ops.ne_op, k) + idx_id, ops.ApplyRight(base_op=ops.ne_op, right_scalar=k) ) block, new_idx_id = block.apply_ternary_op( idx_id, cond_id, const_id, ops.where_op @@ -263,7 +267,7 @@ def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], ) -> Series: - return self._apply_unary_op(bigframes.operations.AsTypeOp(dtype)) + return self._apply_unary_op(bigframes.operations.AsTypeOp(to_type=dtype)) def to_pandas( self, @@ -333,7 +337,7 @@ def drop( level_id = self._resolve_levels(level or 0)[0] if _is_list_like(index): block, inverse_condition_id = block.apply_unary_op( - level_id, ops.IsInOp(index, match_nulls=True) + level_id, ops.IsInOp(values=tuple(index), match_nulls=True) ) block, condition_id = block.apply_unary_op( inverse_condition_id, ops.invert_op @@ -467,7 +471,7 @@ def _regex_replace(self, to_replace: str, value: str): ) block, result_col = self._block.apply_unary_op( self._value_column, - ops.ReplaceRegexOp(to_replace, value), + ops.RegexReplaceStrOp(to_replace, value), result_label=self.name, ) return Series(block.select_column(result_col)) @@ -483,7 +487,7 @@ def _simple_replace(self, to_replace_list: typing.Sequence, value): return self.astype(result_type)._simple_replace(to_replace_list, value) block, cond = self._block.apply_unary_op( - self._value_column, ops.IsInOp(to_replace_list) + self._value_column, ops.IsInOp(tuple(to_replace_list)) ) block, result_col = block.apply_binary_op( cond, @@ -568,9 +572,9 @@ def isin(self, values) -> "Series" | None: f"isin(), you passed a [{type(values).__name__}]" ) - return self._apply_unary_op(ops.IsInOp(values, match_nulls=True)).fillna( - value=False - ) + return self._apply_unary_op( + ops.IsInOp(values=tuple(values), match_nulls=True) + ).fillna(value=False) def isna(self) -> "Series": return self._apply_unary_op(ops.isnull_op) @@ -926,9 +930,9 @@ def clip(self, lower, upper): if lower is None and upper is None: return self if lower is None: - return self._apply_binary_op(upper, ops.clip_upper, alignment="left") + return self._apply_binary_op(upper, ops.clipupper_op, alignment="left") if upper is None: - return self._apply_binary_op(lower, ops.clip_lower, alignment="left") + return self._apply_binary_op(lower, ops.cliplower_op, alignment="left") value_id, lower_id, upper_id, block = self._align3(lower, upper) block, result_id = block.apply_ternary_op( value_id, lower_id, upper_id, ops.clip_op @@ -1235,7 +1239,9 @@ def apply(self, func) -> Series: # to be applied before passing data to remote function, protecting from bad # inputs causing errors. 
reprojected_series = Series(self._block._force_reproject()) - return reprojected_series._apply_unary_op(ops.RemoteFunctionOp(func)) + return reprojected_series._apply_unary_op( + ops.RemoteFunctionOp(func=func, apply_on_null=True) + ) def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_prefix(prefix)) @@ -1264,16 +1270,16 @@ def filter( block = self._block block, label_string_id = block.apply_unary_op( self._block.index_columns[0], - ops.AsTypeOp(pandas.StringDtype(storage="pyarrow")), + ops.AsTypeOp(to_type=pandas.StringDtype(storage="pyarrow")), ) if like is not None: block, mask_id = block.apply_unary_op( - label_string_id, ops.ContainsStringOp(pat=like) + label_string_id, ops.StrContainsOp(pat=like) ) else: # regex assert regex is not None block, mask_id = block.apply_unary_op( - label_string_id, ops.ContainsRegexOp(pat=regex) + label_string_id, ops.StrContainsRegexOp(pat=regex) ) block = block.filter(mask_id) @@ -1283,7 +1289,7 @@ def filter( # Behavior matches pandas 2.1+, older pandas versions would reindex block = self._block block, mask_id = block.apply_unary_op( - self._block.index_columns[0], ops.IsInOp(values=list(items)) + self._block.index_columns[0], ops.IsInOp(values=tuple(items)) ) block = block.filter(mask_id) block = block.select_columns([self._value_column]) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index fbe900106a..15c262afa7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -232,20 +232,28 @@ def read_gbq( query_or_table: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, filters: third_party_pandas_gbq.FiltersType = (), use_cache: bool = True, + col_order: Iterable[str] = (), # Add a verify index argument that fails if the index is not unique. ) -> dataframe.DataFrame: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. - query_or_table = self._filters_to_query(query_or_table, col_order, filters) + if columns and col_order: + raise ValueError( + "Must specify either columns (preferred) or col_order, not both" + ) + elif col_order: + columns = col_order + + query_or_table = self._filters_to_query(query_or_table, columns, filters) if _is_query(query_or_table): return self._read_gbq_query( query_or_table, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, api_name="read_gbq", use_cache=use_cache, @@ -257,7 +265,7 @@ def read_gbq( return self._read_gbq_table( query_or_table, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, api_name="read_gbq", use_cache=use_cache, @@ -388,9 +396,10 @@ def read_gbq_query( query: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, use_cache: bool = True, + col_order: Iterable[str] = (), ) -> dataframe.DataFrame: """Turn a SQL query into a DataFrame. @@ -442,10 +451,17 @@ def read_gbq_query( """ # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so # these docstrings are inline. 
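+        # `columns` is the preferred parameter going forward; `col_order` is
+        # retained only as a backwards-compatible alias, so passing both at
+        # once is ambiguous and rejected below.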
+ if columns and col_order: + raise ValueError( + "Must specify either columns (preferred) or col_order, not both" + ) + elif col_order: + columns = col_order + return self._read_gbq_query( query=query, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, api_name="read_gbq_query", use_cache=use_cache, @@ -456,7 +472,7 @@ def _read_gbq_query( query: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, api_name: str = "read_gbq_query", use_cache: bool = True, @@ -492,7 +508,7 @@ def _read_gbq_query( return self.read_gbq_table( f"{destination.project}.{destination.dataset_id}.{destination.table_id}", index_col=index_cols, - col_order=col_order, + columns=columns, max_results=max_results, use_cache=use_cache, ) @@ -502,9 +518,10 @@ def read_gbq_table( query: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, use_cache: bool = True, + col_order: Iterable[str] = (), ) -> dataframe.DataFrame: """Turn a BigQuery table into a DataFrame. @@ -521,10 +538,17 @@ def read_gbq_table( """ # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so # these docstrings are inline. + if columns and col_order: + raise ValueError( + "Must specify either columns (preferred) or col_order, not both" + ) + elif col_order: + columns = col_order + return self._read_gbq_table( query=query, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, api_name="read_gbq_table", use_cache=use_cache, @@ -583,7 +607,7 @@ def _read_gbq_table( query: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, api_name: str, use_cache: bool = True, @@ -602,10 +626,10 @@ def _read_gbq_table( table_ref, api_name=api_name, use_cache=use_cache ) - for key in col_order: + for key in columns: if key not in table_expression.columns: raise ValueError( - f"Column '{key}' of `col_order` not found in this table." + f"Column '{key}' of `columns` not found in this table." ) if isinstance(index_col, str): @@ -619,8 +643,8 @@ def _read_gbq_table( f"Column `{key}` of `index_col` not found in this table." ) - if col_order: - table_expression = table_expression.select([*index_cols, *col_order]) + if columns: + table_expression = table_expression.select([*index_cols, *columns]) # If the index is unique and sortable, then we don't need to generate # an ordering column. @@ -719,7 +743,7 @@ def _read_bigquery_load_job( *, job_config: bigquery.LoadJobConfig, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), ) -> dataframe.DataFrame: if isinstance(index_col, str): index_cols = [index_col] @@ -760,7 +784,7 @@ def _read_bigquery_load_job( return self.read_gbq_table( table_id, index_col=index_col, - col_order=col_order, + columns=columns, ) def read_gbq_model(self, model_name: str): @@ -959,13 +983,13 @@ def read_csv( if index_col is None: index_col = () - # usecols should only be an iterable of strings (column names) for use as col_order in read_gbq. - col_order: Tuple[Any, ...] = tuple() + # usecols should only be an iterable of strings (column names) for use as columns in read_gbq. + columns: Tuple[Any, ...] 
= tuple() if usecols is not None: if isinstance(usecols, Iterable) and all( isinstance(col, str) for col in usecols ): - col_order = tuple(col for col in usecols) + columns = tuple(col for col in usecols) else: raise NotImplementedError( "BigQuery engine only supports an iterable of strings for `usecols`. " @@ -1000,7 +1024,7 @@ def read_csv( table, job_config=job_config, index_col=index_col, - col_order=col_order, + columns=columns, ) else: if any(arg in kwargs for arg in ("chunksize", "iterator")): diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index 4770f12089..3695fc98e8 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -150,6 +150,17 @@ def create_temp_table( return f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" +def set_table_expiration( + bqclient: bigquery.Client, + table_ref: bigquery.TableReference, + expiration: datetime.datetime, +) -> None: + """Set an expiration time for an existing BigQuery table.""" + table = bqclient.get_table(table_ref) + table.expires = expiration + bqclient.update_table(table, ["expires"]) + + # BigQuery REST API returns types in Legacy SQL format # https://cloud.google.com/bigquery/docs/data-types but we use Standard SQL # names diff --git a/bigframes/version.py b/bigframes/version.py index 494335acd7..aeefff7a4b 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.18.0" +__version__ = "0.19.0" diff --git a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb index 56d7bd1355..52a1c4e768 100644 --- a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb @@ -613,7 +613,7 @@ "source": [ "# Query 3 columns of interest from drug label dataset\n", "df = bpd.read_gbq(\"bigquery-public-data.fda_drug.drug_label\",\n", - " col_order=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n", + " columns=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n", "\n", "# Exclude any rows with missing data\n", "df = df.dropna()\n", @@ -825,7 +825,7 @@ "source": [ "# Query 3 columns of interest from drug label dataset\n", "df_missing = bpd.read_gbq(\"bigquery-public-data.fda_drug.drug_label\",\n", - " col_order=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n", + " columns=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n", "\n", "# Exclude any rows with missing data\n", "df_missing = df_missing.dropna()\n", diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index fd1b803eea..805cee4fec 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -17,11 +17,37 @@ from bigframes.ml import llm -def test_create_text_generator_model(palm2_text_generator_model): +def test_create_text_generator_model( + palm2_text_generator_model, dataset_id, bq_connection +): # Model creation doesn't return error assert palm2_text_generator_model is not None assert palm2_text_generator_model._bqml_model is not None + # save, load to ensure configuration was kept + reloaded_model = palm2_text_generator_model.to_gbq( + f"{dataset_id}.temp_text_model", replace=True + ) + assert 
f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.model_name == "text-bison" + assert reloaded_model.connection_name == bq_connection + + +def test_create_text_generator_32k_model( + palm2_text_generator_32k_model, dataset_id, bq_connection +): + # Model creation doesn't return error + assert palm2_text_generator_32k_model is not None + assert palm2_text_generator_32k_model._bqml_model is not None + + # save, load to ensure configuration was kept + reloaded_model = palm2_text_generator_32k_model.to_gbq( + f"{dataset_id}.temp_text_model", replace=True + ) + assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.model_name == "text-bison-32k" + assert reloaded_model.connection_name == bq_connection + @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_session( @@ -152,19 +178,39 @@ def test_text_generator_predict_with_params_success( assert all(series.str.len() > 20) -def test_create_embedding_generator_model(palm2_embedding_generator_model): +def test_create_embedding_generator_model( + palm2_embedding_generator_model, dataset_id, bq_connection +): # Model creation doesn't return error assert palm2_embedding_generator_model is not None assert palm2_embedding_generator_model._bqml_model is not None + # save, load to ensure configuration was kept + reloaded_model = palm2_embedding_generator_model.to_gbq( + f"{dataset_id}.temp_embedding_model", replace=True + ) + assert f"{dataset_id}.temp_embedding_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.model_name == "textembedding-gecko" + assert reloaded_model.connection_name == bq_connection + def test_create_embedding_generator_multilingual_model( palm2_embedding_generator_multilingual_model, + dataset_id, + bq_connection, ): # Model creation doesn't return error assert palm2_embedding_generator_multilingual_model is not None assert palm2_embedding_generator_multilingual_model._bqml_model is not None + # save, load to ensure configuration was kept + reloaded_model = palm2_embedding_generator_multilingual_model.to_gbq( + f"{dataset_id}.temp_embedding_model", replace=True + ) + assert f"{dataset_id}.temp_embedding_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.model_name == "textembedding-gecko-multilingual" + assert reloaded_model.connection_name == bq_connection + def test_create_text_embedding_generator_model_defaults(bq_connection): import bigframes.pandas as bpd diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 177194c7a8..3882491ecb 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -16,18 +16,22 @@ import pytest import bigframes.series -from tests.system.utils import assert_series_equal +from tests.system.utils import assert_series_equal, skip_legacy_pandas DATETIME_COL_NAMES = [("datetime_col",), ("timestamp_col",)] +DATE_COLUMNS = [ + ("datetime_col",), + ("timestamp_col",), + ("date_col",), +] @pytest.mark.parametrize( ("col_name",), - DATETIME_COL_NAMES, + DATE_COLUMNS, ) -def test_day(scalars_dfs, col_name): - if pd.__version__.startswith("1."): - pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") +@skip_legacy_pandas +def test_dt_day(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.day.to_pandas() @@ -43,9 +47,8 
@@ def test_day(scalars_dfs, col_name): ("col_name",), DATETIME_COL_NAMES, ) -def test_date(scalars_dfs, col_name): - if pd.__version__.startswith("1."): - pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") +@skip_legacy_pandas +def test_dt_date(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.date.to_pandas() @@ -59,11 +62,10 @@ def test_date(scalars_dfs, col_name): @pytest.mark.parametrize( ("col_name",), - DATETIME_COL_NAMES, + DATE_COLUMNS, ) -def test_dayofweek(scalars_dfs, col_name): - if pd.__version__.startswith("1."): - pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") +@skip_legacy_pandas +def test_dt_dayofweek(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.dayofweek.to_pandas() @@ -76,9 +78,8 @@ def test_dayofweek(scalars_dfs, col_name): ("col_name",), DATETIME_COL_NAMES, ) -def test_hour(scalars_dfs, col_name): - if pd.__version__.startswith("1."): - pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") +@skip_legacy_pandas +def test_dt_hour(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.hour.to_pandas() @@ -94,9 +95,8 @@ def test_hour(scalars_dfs, col_name): ("col_name",), DATETIME_COL_NAMES, ) -def test_minute(scalars_dfs, col_name): - if pd.__version__.startswith("1."): - pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") +@skip_legacy_pandas +def test_dt_minute(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.minute.to_pandas() @@ -110,11 +110,10 @@ def test_minute(scalars_dfs, col_name): @pytest.mark.parametrize( ("col_name",), - DATETIME_COL_NAMES, + DATE_COLUMNS, ) -def test_month(scalars_dfs, col_name): - if pd.__version__.startswith("1."): - pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") +@skip_legacy_pandas +def test_dt_month(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.month.to_pandas() @@ -128,11 +127,10 @@ def test_month(scalars_dfs, col_name): @pytest.mark.parametrize( ("col_name",), - DATETIME_COL_NAMES, + DATE_COLUMNS, ) -def test_quarter(scalars_dfs, col_name): - if pd.__version__.startswith("1."): - pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") +@skip_legacy_pandas +def test_dt_quarter(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.quarter.to_pandas() @@ -148,9 +146,8 @@ def test_quarter(scalars_dfs, col_name): ("col_name",), DATETIME_COL_NAMES, ) -def test_second(scalars_dfs, col_name): - if pd.__version__.startswith("1."): - pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") +@skip_legacy_pandas +def test_dt_second(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.second.to_pandas() @@ -166,9 +163,8 @@ def test_second(scalars_dfs, col_name): ("col_name",), DATETIME_COL_NAMES, ) -def test_time(scalars_dfs, col_name): - if pd.__version__.startswith("1."): - pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") 
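+# pandas 1.x lacks pyarrow-backed datetime objects; the shared
+# @skip_legacy_pandas decorator replaces the repeated inline version checks.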
+@skip_legacy_pandas +def test_dt_time(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.time.to_pandas() @@ -182,11 +178,10 @@ def test_time(scalars_dfs, col_name): @pytest.mark.parametrize( ("col_name",), - DATETIME_COL_NAMES, + DATE_COLUMNS, ) -def test_year(scalars_dfs, col_name): - if pd.__version__.startswith("1."): - pytest.skip("Pyarrow datetime objects not support in pandas 1.x.") +@skip_legacy_pandas +def test_dt_year(scalars_dfs, col_name): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] bf_result = bf_series.dt.year.to_pandas() @@ -196,3 +191,31 @@ def test_year(scalars_dfs, col_name): pd_result.astype(pd.Int64Dtype()), bf_result, ) + + +@pytest.mark.parametrize( + ("col_name",), + DATETIME_COL_NAMES, +) +@skip_legacy_pandas +def test_dt_tz(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df[col_name] + bf_result = bf_series.dt.tz + pd_result = scalars_pandas_df[col_name].dt.tz + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("col_name",), + DATETIME_COL_NAMES, +) +@skip_legacy_pandas +def test_dt_unit(scalars_dfs, col_name): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df[col_name] + bf_result = bf_series.dt.unit + pd_result = scalars_pandas_df[col_name].dt.unit + + assert bf_result == pd_result diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index cb2e4f94fa..9557475b46 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3140,17 +3140,65 @@ def test_df___array__(scalars_df_index, scalars_pandas_df_index): ) -def test_getattr_attribute_error_when_pandas_has(scalars_df_index): - # asof is implemented in pandas but not in bigframes +def test_df_getattr_attribute_error_when_pandas_has(scalars_df_index): + # swapaxes is implemented in pandas but not in bigframes with pytest.raises(AttributeError): - scalars_df_index.asof() + scalars_df_index.swapaxes() -def test_getattr_attribute_error(scalars_df_index): +def test_df_getattr_attribute_error(scalars_df_index): with pytest.raises(AttributeError): scalars_df_index.not_a_method() +def test_df_getattr_axes(): + df = dataframe.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + assert isinstance(df.index, bigframes.core.indexes.Index) + assert isinstance(df.columns, pandas.Index) + assert isinstance(df.my_column, series.Series) + + +def test_df_setattr_index(): + pd_df = pandas.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + bf_df = dataframe.DataFrame(pd_df) + pd_df.index = [4, 5] + bf_df.index = [4, 5] + + assert_pandas_df_equal( + pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + +def test_df_setattr_columns(): + pd_df = pandas.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + bf_df = dataframe.DataFrame(pd_df) + pd_df.columns = [4, 5, 6] + bf_df.columns = [4, 5, 6] + + assert_pandas_df_equal( + pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + +def test_df_setattr_modify_column(): + pd_df = pandas.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + bf_df = dataframe.DataFrame(pd_df) + pd_df.my_column = [4, 5] + bf_df.my_column = [4, 5] + + assert_pandas_df_equal( + pd_df, 
bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): index_list = scalars_pandas_df_index.string_col.iloc[[0, 1, 1, 5]].values diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 6f1b31b48e..05c9d22372 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -317,6 +317,55 @@ def test_to_gbq_w_None_column_names( ) +@pytest.mark.parametrize( + "clustering_columns", + [ + pytest.param(["int64_col", "geography_col"]), + pytest.param( + ["float64_col"], + marks=pytest.mark.xfail(raises=google.api_core.exceptions.BadRequest), + ), + pytest.param( + ["int64_col", "int64_col"], + marks=pytest.mark.xfail(raises=ValueError), + ), + ], +) +def test_to_gbq_w_clustering( + scalars_df_default_index, + dataset_id, + bigquery_client, + clustering_columns, +): + """Test the `to_gbq` API for creating clustered tables.""" + destination_table = ( + f"{dataset_id}.test_to_gbq_clustering_{'_'.join(clustering_columns)}" + ) + + scalars_df_default_index.to_gbq( + destination_table, clustering_columns=clustering_columns + ) + table = bigquery_client.get_table(destination_table) + + assert list(table.clustering_fields) == clustering_columns + assert table.expires is None + + +def test_to_gbq_w_clustering_no_destination( + scalars_df_default_index, + bigquery_client, +): + """Test the `to_gbq` API for creating clustered tables without destination.""" + clustering_columns = ["int64_col", "geography_col"] + destination_table = scalars_df_default_index.to_gbq( + clustering_columns=clustering_columns + ) + table = bigquery_client.get_table(destination_table) + + assert list(table.clustering_fields) == clustering_columns + assert table.expires is not None + + def test_to_gbq_w_invalid_destination_table(scalars_df_index): with pytest.raises(ValueError): scalars_df_index.to_gbq("table_id") diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 2d4e1f0204..d767b30bd6 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -16,7 +16,37 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas + + +@skip_legacy_pandas +def test_read_pandas_multi_index_axes(): + index = pandas.MultiIndex.from_arrays( + [ + pandas.Index([4, 99], dtype=pandas.Int64Dtype()), + pandas.Index( + [" Hello, World!", "_some_new_string"], + dtype=pandas.StringDtype(storage="pyarrow"), + ), + ], + names=[" 1index 1", "_1index 2"], + ) + columns = pandas.MultiIndex.from_arrays( + [ + pandas.Index([6, 87], dtype=pandas.Int64Dtype()), + pandas.Index( + [" Bonjour le monde!", "_une_chaîne_de_caractères"], + dtype=pandas.StringDtype(storage="pyarrow"), + ), + ], + names=[" 1columns 1", "_1new_index 2"], + ) + pandas_df = pandas.DataFrame( + [[1, 2], [3, 4]], index=index, columns=columns, dtype=pandas.Int64Dtype() + ) + bf_df = bpd.DataFrame(pandas_df) + + pandas.testing.assert_frame_equal(bf_df.to_pandas(), pandas_df) # Row Multi-index tests @@ -204,6 +234,7 @@ def test_series_multi_index_droplevel(scalars_df_index, scalars_pandas_df_index, (1, 0), ([0, 1], 0), ([True, None], 1), + ((0, True), None), ], ) def test_multi_index_drop(scalars_df_index, scalars_pandas_df_index, labels, level): diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 
282c0d68eb..a79ddb64cd 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -185,6 +185,38 @@ def test_concat_dataframe_mismatched_columns(scalars_dfs, how): pd.testing.assert_frame_equal(bf_result, pd_result) +def test_concat_dataframe_upcasting(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_input1 = scalars_df[["int64_col", "float64_col", "int64_too"]].set_index( + "int64_col", drop=True + ) + bf_input1.columns = ["a", "b"] + bf_input2 = scalars_df[["int64_too", "int64_col", "float64_col"]].set_index( + "float64_col", drop=True + ) + bf_input2.columns = ["a", "b"] + bf_result = bpd.concat([bf_input1, bf_input2], join="outer") + bf_result = bf_result.to_pandas() + + bf_input1 = ( + scalars_pandas_df[["int64_col", "float64_col", "int64_too"]] + .set_index("int64_col", drop=True) + .set_axis(["a", "b"], axis=1) + ) + bf_input2 = ( + scalars_pandas_df[["int64_too", "int64_col", "float64_col"]] + .set_index("float64_col", drop=True) + .set_axis(["a", "b"], axis=1) + ) + pd_result = pd.concat( + [bf_input1, bf_input2], + join="outer", + ) + + pd.testing.assert_frame_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("how",), [ diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 8ce442376a..2d9c332de1 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -52,7 +52,7 @@ def test_read_gbq_tokyo( @pytest.mark.parametrize( - ("query_or_table", "col_order"), + ("query_or_table", "columns"), [ pytest.param( "{scalars_table_id}", ["bool_col", "int64_col"], id="two_cols_in_table" @@ -79,16 +79,16 @@ def test_read_gbq_tokyo( ), ], ) -def test_read_gbq_w_col_order( +def test_read_gbq_w_columns( session: bigframes.Session, scalars_table_id: str, query_or_table: str, - col_order: List[str], + columns: List[str], ): df = session.read_gbq( - query_or_table.format(scalars_table_id=scalars_table_id), col_order=col_order + query_or_table.format(scalars_table_id=scalars_table_id), columns=columns ) - assert df.columns.tolist() == col_order + assert df.columns.tolist() == columns @pytest.mark.parametrize( diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index c3794c550e..bca18bd0b7 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -11,9 +11,10 @@ """ from __future__ import annotations -from typing import Literal, Mapping, Optional, Sequence, Union +from typing import Hashable, Iterable, Literal, Mapping, Optional, Sequence, Union import numpy as np +import pandas as pd from bigframes import constants from third_party.bigframes_vendored.pandas.core.generic import NDFrame @@ -307,6 +308,7 @@ def to_gbq( if_exists: Optional[Literal["fail", "replace", "append"]] = None, index: bool = True, ordering_id: Optional[str] = None, + clustering_columns: Union[pd.Index, Iterable[Hashable]] = (), ) -> str: """Write a DataFrame to a BigQuery table. @@ -336,6 +338,16 @@ def to_gbq( [2 rows x 2 columns] + Write a DataFrame to a BigQuery table with clustering columns: + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}) + >>> clustering_cols = ['col1', 'col3'] + >>> df.to_gbq( + ... "bigframes-dev.birds.test-clusters", + ... if_exists="replace", + ... clustering_columns=clustering_cols, + ... 
)
+        'bigframes-dev.birds.test-clusters'
+
         Args:
             destination_table (Optional[str]):
                 Name of table to be written, in the form ``dataset.tablename``
@@ -364,6 +376,15 @@
                 If set, write the ordering of the DataFrame as a column in the
                 result table with this name.
 
+            clustering_columns (Union[pd.Index, Iterable[Hashable]], default ()):
+                Specifies the columns for clustering in the BigQuery table. The order
+                of columns in this list is significant for the clustering hierarchy. Index
+                columns may be included in clustering if the `index` parameter is set
+                to True, and their names are included in this list. These index columns,
+                if included, precede DataFrame columns in the clustering order. The
+                clustering order within the Index/DataFrame columns follows the order
+                specified in `clustering_columns`.
+
         Returns:
             str:
                 The fully-qualified ID for the written table, in the form
@@ -1086,7 +1107,7 @@
         Args:
             labels:
-                Index or column labels to drop.
+                Index or column labels to drop. A tuple will be used as a single label and not treated as a list-like.
             axis:
                 Whether to drop labels from the index (0 or 'index') or
                 columns (1 or 'columns').
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
index 72b947f96c..2ca51f6493 100644
--- a/third_party/bigframes_vendored/pandas/core/generic.py
+++ b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -627,6 +627,48 @@ def copy(self):
     def ffill(self, *, limit: Optional[int] = None):
         """Fill NA/NaN values by propagating the last valid observation to next valid.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> import numpy as np
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0],
+            ...                     [3, 4, np.nan, 1],
+            ...                     [np.nan, np.nan, np.nan, np.nan],
+            ...                     [np.nan, 3, np.nan, 4]],
+            ...                    columns=list("ABCD")).astype("Float64")
+            >>> df
+                  A     B     C     D
+            0  <NA>   2.0  <NA>   0.0
+            1   3.0   4.0  <NA>   1.0
+            2  <NA>  <NA>  <NA>  <NA>
+            3  <NA>   3.0  <NA>   4.0
+
+            [4 rows x 4 columns]
+
+        Fill NA/NaN values in DataFrames:
+
+            >>> df.ffill()
+                  A     B     C     D
+            0  <NA>   2.0  <NA>   0.0
+            1   3.0   4.0  <NA>   1.0
+            2   3.0   4.0  <NA>   1.0
+            3   3.0   3.0  <NA>   4.0
+
+            [4 rows x 4 columns]
+
+
+        Fill NA/NaN values in Series:
+
+            >>> series = bpd.Series([1, np.nan, 2, 3])
+            >>> series.ffill()
+            0    1.0
+            1    1.0
+            2    2.0
+            3    3.0
+            dtype: Float64
+
         Args:
             limit : int, default None
                 If method is specified, this is the maximum number of consecutive
diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py
index 2b4a326317..9490f4608b 100644
--- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py
+++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py
@@ -94,3 +94,22 @@ def year(self):
         """The year of the datetime."""
 
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    @property
+    def tz(self):
+        """Return the timezone.
+
+        Returns:
+            datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None
+        """
+
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    @property
+    def unit(self) -> str:
+        """Returns the unit of time precision.
+
+        Returns:
+            Unit as string (e.g. "us").
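+
+            A minimal sketch, assuming ``s`` is a Series backed by a
+            microsecond-precision pyarrow timestamp dtype::
+
+                s.dt.unit  # "us"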
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index dc8bcc1f77..8e2c9f092d 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -17,10 +17,11 @@ def read_gbq( query_or_table: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, filters: FiltersType = (), use_cache: bool = True, + col_order: Iterable[str] = (), ): """Loads a DataFrame from BigQuery. @@ -77,11 +78,11 @@ def read_gbq( Reading data with `columns` and `filters` parameters: - >>> col_order = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] + >>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe']), ('pitcherLastName', 'in', ['Gant'])] >>> df = bpd.read_gbq( ... "bigquery-public-data.baseball.games_wide", - ... col_order=col_order, + ... columns=columns, ... filters=filters, ... ) >>> df.head(1) @@ -97,7 +98,7 @@ def read_gbq( `project.dataset.tablename` or `dataset.tablename`. index_col (Iterable[str] or str): Name of result column(s) to use for index in results DataFrame. - col_order (Iterable[str]): + columns (Iterable[str]): List of BigQuery column names in the desired order for results DataFrame. max_results (Optional[int], default None): @@ -113,6 +114,8 @@ def read_gbq( is to be conducted. use_cache (bool, default True): Whether to cache the query inputs. Default to True. + col_order (Iterable[str]): + Alias for columns, retained for backwards compatibility. Returns: bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table.