diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index af05f4423c..4fd6488c9c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,7 +35,8 @@ repos: hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.1.1 + rev: v1.10.0 hooks: - id: mypy additional_dependencies: [types-requests, types-tabulate, pandas-stubs] + args: ["--check-untyped-defs", "--explicit-package-bases", '--exclude="^third_party"', "--ignore-missing-imports"] diff --git a/CHANGELOG.md b/CHANGELOG.md index 568efa68b4..34ab012fd7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,40 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.7.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.6.0...v1.7.0) (2024-05-20) + + +### Features + +* `read_gbq_query` supports `filters` ([9386373](https://github.com/googleapis/python-bigquery-dataframes/commit/9386373538c1e7827e2210c4fd9946312821b54d)) +* `read_gbq` suggests a correct column name when one is not found ([9386373](https://github.com/googleapis/python-bigquery-dataframes/commit/9386373538c1e7827e2210c4fd9946312821b54d)) +* Add `DefaultIndexKind.NULL` to use as `index_col` in `read_gbq*`, creating an indexless DataFrame/Series ([#662](https://github.com/googleapis/python-bigquery-dataframes/issues/662)) ([29e4886](https://github.com/googleapis/python-bigquery-dataframes/commit/29e4886d41e3d615bc493cf3a104ef1b0698ece8)) +* Bigframes.bigquery.array_agg(SeriesGroupBy|DataFrameGroupby) ([#663](https://github.com/googleapis/python-bigquery-dataframes/issues/663)) ([412f28b](https://github.com/googleapis/python-bigquery-dataframes/commit/412f28bf7551430473690160a2a1c4c2f133539e)) +* To_datetime supports utc=False for string inputs ([#579](https://github.com/googleapis/python-bigquery-dataframes/issues/579)) ([adf9889](https://github.com/googleapis/python-bigquery-dataframes/commit/adf98892e499f4a9c85162c38f56ca5634a1ba6d)) + + +### Bug Fixes + +* `read_gbq_table` respects primary keys even when `filters` are set ([#689](https://github.com/googleapis/python-bigquery-dataframes/issues/689)) ([9386373](https://github.com/googleapis/python-bigquery-dataframes/commit/9386373538c1e7827e2210c4fd9946312821b54d)) +* Fix type error in test_cluster ([#698](https://github.com/googleapis/python-bigquery-dataframes/issues/698)) ([14d81c1](https://github.com/googleapis/python-bigquery-dataframes/commit/14d81c17505f9a09439a874ff855aec6f95fc0d1)) +* Improve escaping of literals and identifiers ([#682](https://github.com/googleapis/python-bigquery-dataframes/issues/682)) ([da9b136](https://github.com/googleapis/python-bigquery-dataframes/commit/da9b136df08b243c8515946f7c0d7b591b8fcbdc)) +* Properly identify non-unique index in tables without primary keys ([#699](https://github.com/googleapis/python-bigquery-dataframes/issues/699)) 
([6e0f4d8](https://github.com/googleapis/python-bigquery-dataframes/commit/6e0f4d8c76f78dc26f4aa1880dd67ebdb638bb5e)) +* Remove a usage of the `resource` package when not available, such as on Windows ([#681](https://github.com/googleapis/python-bigquery-dataframes/issues/681)) ([96243f2](https://github.com/googleapis/python-bigquery-dataframes/commit/96243f23a1571001509d0d01c16c1e72e47e0d23)) +* The imported samples error and use peek() ([#688](https://github.com/googleapis/python-bigquery-dataframes/issues/688)) ([1a0b744](https://github.com/googleapis/python-bigquery-dataframes/commit/1a0b744c5aacdd8ba4eececf7b0a374808e8672c)) + + +### Performance Improvements + +* Don't run query immediately from `read_gbq_table` if `filters` is set ([9386373](https://github.com/googleapis/python-bigquery-dataframes/commit/9386373538c1e7827e2210c4fd9946312821b54d)) +* Use a `LIMIT` clause when `max_results` is set ([9386373](https://github.com/googleapis/python-bigquery-dataframes/commit/9386373538c1e7827e2210c4fd9946312821b54d)) + + +### Documentation + +* Add code snippets for imported onnx tutorials ([#684](https://github.com/googleapis/python-bigquery-dataframes/issues/684)) ([cb36e46](https://github.com/googleapis/python-bigquery-dataframes/commit/cb36e468d1c2a34c2231638124f3c8d9052f032b)) +* Add code snippets for imported tensorflow model ([#679](https://github.com/googleapis/python-bigquery-dataframes/issues/679)) ([b02c401](https://github.com/googleapis/python-bigquery-dataframes/commit/b02c401614eeab9cbf2e9a7c648b3d0a4e741b97)) +* Use `class_weight="balanced"` in the logistic regression prediction tutorial ([#678](https://github.com/googleapis/python-bigquery-dataframes/issues/678)) ([b951549](https://github.com/googleapis/python-bigquery-dataframes/commit/b95154908fd7838e499a2af0fc3760c5ab33358f)) + ## [1.6.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.5.0...v1.6.0) (2024-05-13) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 197e0a83b5..6c9c04dca7 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -22,9 +22,13 @@ import typing +import bigframes.constants as constants +import bigframes.core.groupby as groupby import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe import bigframes.series as series @@ -52,9 +56,66 @@ def array_length(series: series.Series) -> series.Series: 2 2 dtype: Int64 + Args: + series (bigframes.series.Series): + A Series with array columns. + Returns: bigframes.series.Series: A Series of integer values indicating the length of each element in the Series. """ return series._apply_unary_op(ops.len_op) + + +def array_agg( + obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy, +) -> series.Series | dataframe.DataFrame: + """Group data and create arrays from selected columns, omitting NULLs to avoid + BigQuery errors (NULLs not allowed in arrays). 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + For a SeriesGroupBy object: + + >>> lst = ['a', 'a', 'b', 'b', 'a'] + >>> s = bpd.Series([1, 2, 3, 4, np.nan], index=lst) + >>> bbq.array_agg(s.groupby(level=0)) + a [1. 2.] + b [3. 4.] + dtype: list[pyarrow] + + For a DataFrameGroupBy object: + + >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + >>> df = bpd.DataFrame(l, columns=["a", "b", "c"]) + >>> bbq.array_agg(df.groupby(by=["b"])) + a c + b + 1.0 [2] [3] + 2.0 [1 1] [3 2] + + [2 rows x 2 columns] + + Args: + obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy): + A GroupBy object to be applied the function. + + Returns: + bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or + DataFrame containing aggregated array columns, and indexed by the + original group columns. + """ + if isinstance(obj, groupby.SeriesGroupBy): + return obj._aggregate(agg_ops.ArrayAggOp()) + elif isinstance(obj, groupby.DataFrameGroupBy): + return obj._aggregate_all(agg_ops.ArrayAggOp(), numeric_only=False) + else: + raise ValueError( + f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}" + ) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 79c6bb6495..0a2936419f 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -456,6 +456,19 @@ def join( return ArrayValue(bigframes.core.rewrite.maybe_rewrite_join(join_node)) return ArrayValue(join_node) + def try_align_as_projection( + self, + other: ArrayValue, + join_type: join_def.JoinType, + mappings: typing.Tuple[join_def.JoinColumnMapping, ...], + ) -> typing.Optional[ArrayValue]: + left_side = bigframes.core.rewrite.SquashedSelect.from_node(self.node) + right_side = bigframes.core.rewrite.SquashedSelect.from_node(other.node) + result = left_side.maybe_merge(right_side, join_type, mappings) + if result is not None: + return ArrayValue(result.expand()) + return None + def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue: assert len(column_ids) > 0 for column_id in column_ids: diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index e12e6bf054..eaee2e2cc0 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -597,9 +597,11 @@ def skew( block = block.select_columns(skew_ids).with_column_labels(column_labels) if not grouping_column_ids: - # When ungrouped, stack everything into single column so can be returned as series - block = block.stack() - block = block.drop_levels([block.index_columns[0]]) + # When ungrouped, transpose result row into a series + # perform transpose last, so as to not invalidate cache + block, index_col = block.create_constant(None, None) + block = block.set_index([index_col]) + return block.transpose(original_row_index=pd.Index([None])) return block @@ -637,9 +639,11 @@ def kurt( block = block.select_columns(kurt_ids).with_column_labels(column_labels) if not grouping_column_ids: - # When ungrouped, stack everything into single column so can be returned as series - block = block.stack() - block = block.drop_levels([block.index_columns[0]]) + # When ungrouped, transpose result row into a series + # perform transpose last, so as to not invalidate cache + block, index_col = block.create_constant(None, None) + block = block.set_index([index_col]) + return block.transpose(original_row_index=pd.Index([None])) return block @@ -820,7 +824,8 @@ 
def idxmax(block: blocks.Block) -> blocks.Block: def _idx_extrema( block: blocks.Block, min_or_max: typing.Literal["min", "max"] ) -> blocks.Block: - if len(block.index_columns) != 1: + block._throw_if_null_index("idx") + if len(block.index_columns) > 1: # TODO: Need support for tuple dtype raise NotImplementedError( f"idxmin not support for multi-index. {constants.FEEDBACK_LINK}" diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 58b8515418..0bbb8a0b61 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -51,6 +51,7 @@ import bigframes.core.utils as utils import bigframes.core.window_spec as window_specs import bigframes.dtypes +import bigframes.exceptions import bigframes.features import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -120,19 +121,11 @@ def __init__( f"'index_columns' (size {len(index_columns)}) and 'index_labels' (size {len(index_labels)}) must have equal length" ) - # If no index columns are set, create one. - # - # Note: get_index_cols_and_uniqueness in - # bigframes/session/_io/bigquery/read_gbq_table.py depends on this - # being as sequential integer index column. If this default behavior - # ever changes, please also update get_index_cols_and_uniqueness so - # that users who explicitly request a sequential integer index can - # still get one. if len(index_columns) == 0: - new_index_col_id = guid.generate_guid() - expr = expr.promote_offsets(new_index_col_id) - index_columns = [new_index_col_id] - + warnings.warn( + "Creating object with Null Index. Null Index is a preview feature.", + category=bigframes.exceptions.PreviewWarning, + ) self._index_columns = tuple(index_columns) # Index labels don't need complicated hierarchical access so can store as tuple self._index_labels = ( @@ -517,7 +510,8 @@ def _copy_index_to_pandas(self, df: pd.DataFrame): Warning: This method modifies ``df`` inplace. """ - if self.index_columns: + # Note: If BigQuery DataFrame has null index, a default one will be created for the local materialization. + if len(self.index_columns) > 0: df.set_index(list(self.index_columns), inplace=True) # Pandas names is annotated as list[str] rather than the more # general Sequence[Label] that BigQuery DataFrames has. 
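To make the null-index behavior these Block changes enable more concrete, here is a minimal usage sketch (not part of the patch). It assumes `DefaultIndexKind` is importable from `bigframes.enums` (the changelog entry above only names the enum member), that an authenticated BigQuery session is configured, and it uses an arbitrary public table purely as a stand-in:

```python
import warnings

import bigframes.enums  # assumed import path for DefaultIndexKind
import bigframes.exceptions
import bigframes.pandas as bpd

# Requesting a NULL default index exercises the new Block.__init__ path above:
# no sequential index column is synthesized, and a PreviewWarning is emitted.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df = bpd.read_gbq(
        "bigquery-public-data.ml_datasets.penguins",  # hypothetical table choice
        index_col=bigframes.enums.DefaultIndexKind.NULL,
    )
print(
    [
        str(w.message)
        for w in caught
        if issubclass(w.category, bigframes.exceptions.PreviewWarning)
    ]
)

# Index-dependent operations now hit the new _throw_if_null_index guard
# instead of silently materializing a default index.
try:
    df.sort_index()
except bigframes.exceptions.NullIndexError as exc:
    print(exc)
```

Most other operations are expected to keep working on such an indexless frame; only those that genuinely need row labels reach the guard.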
@@ -1093,16 +1087,25 @@ def aggregate( aggregate_labels = self._get_labels_for_columns( [agg[0] for agg in aggregations] ) + names: typing.List[Label] = [] - for by_col_id in by_column_ids: - if by_col_id in self.value_columns: - names.append(self.col_id_to_label[by_col_id]) - else: - names.append(self.col_id_to_index_name[by_col_id]) + if len(by_column_ids) == 0: + label_id = guid.generate_guid() + result_expr = result_expr.assign_constant(label_id, 0, pd.Int64Dtype()) + index_columns = (label_id,) + names = [None] + else: + index_columns = tuple(by_column_ids) # type: ignore + for by_col_id in by_column_ids: + if by_col_id in self.value_columns: + names.append(self.col_id_to_label[by_col_id]) + else: + names.append(self.col_id_to_index_name[by_col_id]) + return ( Block( result_expr, - index_columns=by_column_ids, + index_columns=index_columns, column_labels=aggregate_labels, index_labels=names, ), @@ -1226,8 +1229,9 @@ def calculate_pairwise_metric(self, op=agg_ops.CorrOp()): ) labels = self._get_labels_for_columns(self.value_columns) + # TODO(b/340896143): fix type error expr = expr.unpivot( - row_labels=labels, + row_labels=labels, # type: ignore index_col_ids=index_col_ids, unpivot_columns=unpivot_columns, ) @@ -1255,11 +1259,12 @@ def explode( expr = self.expr.explode(column_ids) if ignore_index: + new_index_ids = guid.generate_guid() return Block( - expr.drop_columns(self.index_columns), + expr.drop_columns(self.index_columns).promote_offsets(new_index_ids), column_labels=self.column_labels, # Initiates default index creation using the block constructor. - index_columns=[], + index_columns=[new_index_ids], ) else: return Block( @@ -1422,7 +1427,8 @@ def retrieve_repr_request_results( computed_df, query_job = head_block.to_pandas() formatted_df = computed_df.set_axis(self.column_labels, axis=1) # we reset the axis and substitute the bf index name(s) for the default - formatted_df.index.names = self.index.names # type: ignore + if len(self.index.names) > 0: + formatted_df.index.names = self.index.names # type: ignore return formatted_df, count, query_job def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: @@ -1906,9 +1912,26 @@ def join( other: Block, *, how="left", - sort=False, + sort: bool = False, block_identity_join: bool = False, ) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: + """ + Join two blocks objects together, and provide mappings between source columns and output columns. + + Args: + other (Block): + The right operand of the join operation + how (str): + Describes the join type. 'inner', 'outer', 'left', or 'right' + sort (bool): + if true will sort result by index + block_identity_join (bool): + If true, will not convert join to a projection (implicitly assuming unique indices) + + Returns: + Block, (left_mapping, right_mapping): Result block and mappers from input column ids to result column ids. + """ + if not isinstance(other, Block): # TODO(swast): We need to improve this error message to be more # actionable for the user. For example, it's possible they @@ -1922,6 +1945,16 @@ def join( raise NotImplementedError( f"Only how='outer','left','right','inner' currently supported. 
{constants.FEEDBACK_LINK}" ) + # Special case for null index, + if ( + (self.index.nlevels == other.index.nlevels == 0) + and not sort + and not block_identity_join + ): + return join_indexless(self, other, how=how) + + self._throw_if_null_index("join") + other._throw_if_null_index("join") if self.index.nlevels == other.index.nlevels == 1: return join_mono_indexed( self, other, how=how, sort=sort, block_identity_join=block_identity_join @@ -2070,6 +2103,12 @@ def _is_monotonic( self._stats_cache[column_name].update({op_name: result}) return result + def _throw_if_null_index(self, opname: str): + if len(self.index_columns) == 0: + raise bigframes.exceptions.NullIndexError( + f"Cannot do {opname} without an index. Set an index using set_index." + ) + def _get_rows_as_json_values(self) -> Block: # We want to preserve any ordering currently present before turning to # direct SQL manipulation. We will restore the ordering when we rebuild @@ -2096,7 +2135,7 @@ def _get_rows_as_json_values(self) -> Block: ) column_names.append(serialized_column_name) - column_names_csv = sql.csv(column_names, quoted=True) + column_names_csv = sql.csv(map(sql.simple_literal, column_names)) # index columns count index_columns_count = len(self.index_columns) @@ -2108,22 +2147,22 @@ def _get_rows_as_json_values(self) -> Block: # types of the columns to serialize for the row column_types = list(self.index.dtypes) + list(self.dtypes) - column_types_csv = sql.csv([str(typ) for typ in column_types], quoted=True) + column_types_csv = sql.csv( + [sql.simple_literal(str(typ)) for typ in column_types] + ) # row dtype to use for deserializing the row as pandas series pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types) if pandas_row_dtype is None: pandas_row_dtype = "object" - pandas_row_dtype = sql.quote(str(pandas_row_dtype)) + pandas_row_dtype = sql.simple_literal(str(pandas_row_dtype)) # create a json column representing row through SQL manipulation row_json_column_name = guid.generate_guid() select_columns = ( [ordering_column_name] + list(self.index_columns) + [row_json_column_name] ) - select_columns_csv = sql.csv( - [sql.column_reference(col) for col in select_columns] - ) + select_columns_csv = sql.csv([sql.identifier(col) for col in select_columns]) json_sql = f"""\ With T0 AS ( {textwrap.indent(expr_sql, " ")} @@ -2136,7 +2175,7 @@ def _get_rows_as_json_values(self) -> Block: "values", [{column_references_csv}], "indexlength", {index_columns_count}, "dtype", {pandas_row_dtype} - ) AS {row_json_column_name} FROM T0 + ) AS {sql.identifier(row_json_column_name)} FROM T0 ) SELECT {select_columns_csv} FROM T1 """ @@ -2210,6 +2249,10 @@ def __repr__(self) -> str: def to_pandas(self) -> pd.Index: """Executes deferred operations and downloads the results.""" + if len(self.column_ids) == 0: + raise bigframes.exceptions.NullIndexError( + "Cannot materialize index, as this object does not have an index. Set index column(s) using set_index." + ) # Project down to only the index column. So the query can be cached to visualize other data. 
index_columns = list(self._block.index_columns) dtypes = dict(zip(index_columns, self.dtypes)) @@ -2251,6 +2294,53 @@ def is_uniquely_named(self: BlockIndexProperties): return len(set(self.names)) == len(self.names) +def join_indexless( + left: Block, + right: Block, + *, + how="left", +) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: + """Joins two blocks""" + left_expr = left.expr + right_expr = right.expr + left_mappings = [ + join_defs.JoinColumnMapping( + source_table=join_defs.JoinSide.LEFT, + source_id=id, + destination_id=guid.generate_guid(), + ) + for id in left_expr.column_ids + ] + right_mappings = [ + join_defs.JoinColumnMapping( + source_table=join_defs.JoinSide.RIGHT, + source_id=id, + destination_id=guid.generate_guid(), + ) + for id in right_expr.column_ids + ] + combined_expr = left_expr.try_align_as_projection( + right_expr, + join_type=how, + mappings=(*left_mappings, *right_mappings), + ) + if combined_expr is None: + raise bigframes.exceptions.NullIndexError( + "Cannot implicitly align objects. Set an explicit index using set_index." + ) + get_column_left = {m.source_id: m.destination_id for m in left_mappings} + get_column_right = {m.source_id: m.destination_id for m in right_mappings} + block = Block( + combined_expr, + column_labels=[*left.column_labels, *right.column_labels], + index_columns=(), + ) + return ( + block, + (get_column_left, get_column_right), + ) + + def join_mono_indexed( left: Block, right: Block, diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 98d296c779..c0b0562a54 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -34,13 +34,14 @@ def compile_aggregate( aggregate: ex.Aggregation, bindings: typing.Dict[str, ibis_types.Value], + order_by: typing.Sequence[ibis_types.Value] = [], ) -> ibis_types.Value: if isinstance(aggregate, ex.UnaryAggregation): input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings) - return compile_unary_agg( - aggregate.op, - input, - ) + if aggregate.op.can_order_by: + return compile_ordered_unary_agg(aggregate.op, input, order_by=order_by) + else: + return compile_unary_agg(aggregate.op, input) elif isinstance(aggregate, ex.BinaryAggregation): left = scalar_compiler.compile_expression(aggregate.left, bindings=bindings) right = scalar_compiler.compile_expression(aggregate.right, bindings=bindings) @@ -66,7 +67,8 @@ def compile_analytic( @functools.singledispatch def compile_binary_agg( op: agg_ops.WindowOp, - input: ibis_types.Column, + left: ibis_types.Column, + right: ibis_types.Column, window: Optional[window_spec.WindowSpec] = None, ) -> ibis_types.Value: raise ValueError(f"Can't compile unrecognized operation: {op}") @@ -81,9 +83,24 @@ def compile_unary_agg( raise ValueError(f"Can't compile unrecognized operation: {op}") +@functools.singledispatch +def compile_ordered_unary_agg( + op: agg_ops.WindowOp, + input: ibis_types.Column, + window: Optional[window_spec.WindowSpec] = None, + order_by: typing.Sequence[ibis_types.Value] = [], +) -> ibis_types.Value: + raise ValueError(f"Can't compile unrecognized operation: {op}") + + def numeric_op(operation): @functools.wraps(operation) - def constrained_op(op, column: ibis_types.Column, window=None): + def constrained_op( + op, + column: ibis_types.Column, + window=None, + order_by: typing.Sequence[ibis_types.Value] = [], + ): if column.type().is_boolean(): column = typing.cast( ibis_types.NumericColumn, 
column.cast(ibis_dtypes.int64) @@ -104,7 +121,9 @@ def constrained_op(op, column: ibis_types.Column, window=None): @compile_unary_agg.register @numeric_op def _( - op: agg_ops.SumOp, column: ibis_types.NumericColumn, window=None + op: agg_ops.SumOp, + column: ibis_types.NumericColumn, + window=None, ) -> ibis_types.NumericValue: # Will be null if all inputs are null. Pandas defaults to zero sum though. bq_sum = _apply_window_if_present(column.sum(), window) @@ -116,7 +135,9 @@ def _( @compile_unary_agg.register @numeric_op def _( - op: agg_ops.MedianOp, column: ibis_types.NumericColumn, window=None + op: agg_ops.MedianOp, + column: ibis_types.NumericColumn, + window=None, ) -> ibis_types.NumericValue: # PERCENTILE_CONT has very few allowed windows. For example, "window # framing clause is not allowed for analytic function percentile_cont". @@ -134,7 +155,9 @@ def _( @compile_unary_agg.register @numeric_op def _( - op: agg_ops.ApproxQuartilesOp, column: ibis_types.NumericColumn, window=None + op: agg_ops.ApproxQuartilesOp, + column: ibis_types.NumericColumn, + window=None, ) -> ibis_types.NumericValue: # PERCENTILE_CONT has very few allowed windows. For example, "window # framing clause is not allowed for analytic function percentile_cont". @@ -151,7 +174,9 @@ def _( @compile_unary_agg.register @numeric_op def _( - op: agg_ops.QuantileOp, column: ibis_types.NumericColumn, window=None + op: agg_ops.QuantileOp, + column: ibis_types.NumericColumn, + window=None, ) -> ibis_types.NumericValue: return _apply_window_if_present(column.quantile(op.q), window) @@ -159,7 +184,10 @@ def _( @compile_unary_agg.register @numeric_op def _( - op: agg_ops.MeanOp, column: ibis_types.NumericColumn, window=None + op: agg_ops.MeanOp, + column: ibis_types.NumericColumn, + window=None, + # order_by: typing.Sequence[ibis_types.Value] = [], ) -> ibis_types.NumericValue: return _apply_window_if_present(column.mean(), window) @@ -167,7 +195,9 @@ def _( @compile_unary_agg.register @numeric_op def _( - op: agg_ops.ProductOp, column: ibis_types.NumericColumn, window=None + op: agg_ops.ProductOp, + column: ibis_types.NumericColumn, + window=None, ) -> ibis_types.NumericValue: # Need to short-circuit as log with zeroes is illegal sql is_zero = cast(ibis_types.BooleanColumn, (column == 0)) @@ -202,30 +232,50 @@ def _( @compile_unary_agg.register -def _(op: agg_ops.MaxOp, column: ibis_types.Column, window=None) -> ibis_types.Value: +def _( + op: agg_ops.MaxOp, + column: ibis_types.Column, + window=None, +) -> ibis_types.Value: return _apply_window_if_present(column.max(), window) @compile_unary_agg.register -def _(op: agg_ops.MinOp, column: ibis_types.Column, window=None) -> ibis_types.Value: +def _( + op: agg_ops.MinOp, + column: ibis_types.Column, + window=None, +) -> ibis_types.Value: return _apply_window_if_present(column.min(), window) @compile_unary_agg.register @numeric_op -def _(op: agg_ops.StdOp, x: ibis_types.Column, window=None) -> ibis_types.Value: +def _( + op: agg_ops.StdOp, + x: ibis_types.Column, + window=None, +) -> ibis_types.Value: return _apply_window_if_present(cast(ibis_types.NumericColumn, x).std(), window) @compile_unary_agg.register @numeric_op -def _(op: agg_ops.VarOp, x: ibis_types.Column, window=None) -> ibis_types.Value: +def _( + op: agg_ops.VarOp, + x: ibis_types.Column, + window=None, +) -> ibis_types.Value: return _apply_window_if_present(cast(ibis_types.NumericColumn, x).var(), window) @compile_unary_agg.register @numeric_op -def _(op: agg_ops.PopVarOp, x: ibis_types.Column, window=None) -> 
ibis_types.Value: +def _( + op: agg_ops.PopVarOp, + x: ibis_types.Column, + window=None, +) -> ibis_types.Value: return _apply_window_if_present( cast(ibis_types.NumericColumn, x).var(how="pop"), window ) @@ -233,13 +283,19 @@ def _(op: agg_ops.PopVarOp, x: ibis_types.Column, window=None) -> ibis_types.Val @compile_unary_agg.register def _( - op: agg_ops.CountOp, column: ibis_types.Column, window=None + op: agg_ops.CountOp, + column: ibis_types.Column, + window=None, ) -> ibis_types.IntegerValue: return _apply_window_if_present(column.count(), window) @compile_unary_agg.register -def _(op: agg_ops.CutOp, x: ibis_types.Column, window=None): +def _( + op: agg_ops.CutOp, + x: ibis_types.Column, + window=None, +): out = ibis.case() if isinstance(op.bins, int): col_min = _apply_window_if_present(x.min(), window) @@ -292,7 +348,9 @@ def _(op: agg_ops.CutOp, x: ibis_types.Column, window=None): @compile_unary_agg.register @numeric_op def _( - self: agg_ops.QcutOp, column: ibis_types.Column, window=None + self: agg_ops.QcutOp, + column: ibis_types.Column, + window=None, ) -> ibis_types.IntegerValue: if isinstance(self.quantiles, int): quantiles_ibis = dtypes.literal_to_ibis_scalar(self.quantiles) @@ -322,21 +380,27 @@ def _( @compile_unary_agg.register def _( - op: agg_ops.NuniqueOp, column: ibis_types.Column, window=None + op: agg_ops.NuniqueOp, + column: ibis_types.Column, + window=None, ) -> ibis_types.IntegerValue: return _apply_window_if_present(column.nunique(), window) @compile_unary_agg.register def _( - op: agg_ops.AnyValueOp, column: ibis_types.Column, window=None + op: agg_ops.AnyValueOp, + column: ibis_types.Column, + window=None, ) -> ibis_types.IntegerValue: return _apply_window_if_present(column.arbitrary(), window) @compile_unary_agg.register def _( - op: agg_ops.RankOp, column: ibis_types.Column, window=None + op: agg_ops.RankOp, + column: ibis_types.Column, + window=None, ) -> ibis_types.IntegerValue: # Ibis produces 0-based ranks, while pandas creates 1-based ranks return _apply_window_if_present(ibis.rank(), window) + 1 @@ -344,7 +408,9 @@ def _( @compile_unary_agg.register def _( - op: agg_ops.DenseRankOp, column: ibis_types.Column, window=None + op: agg_ops.DenseRankOp, + column: ibis_types.Column, + window=None, ) -> ibis_types.IntegerValue: # Ibis produces 0-based ranks, while pandas creates 1-based ranks return _apply_window_if_present(column.dense_rank(), window) + 1 @@ -357,7 +423,9 @@ def _(op: agg_ops.FirstOp, column: ibis_types.Column, window=None) -> ibis_types @compile_unary_agg.register def _( - op: agg_ops.FirstNonNullOp, column: ibis_types.Column, window=None + op: agg_ops.FirstNonNullOp, + column: ibis_types.Column, + window=None, ) -> ibis_types.Value: return _apply_window_if_present( vendored_ibis_ops.FirstNonNullValue(column).to_expr(), window # type: ignore @@ -365,13 +433,19 @@ def _( @compile_unary_agg.register -def _(op: agg_ops.LastOp, column: ibis_types.Column, window=None) -> ibis_types.Value: +def _( + op: agg_ops.LastOp, + column: ibis_types.Column, + window=None, +) -> ibis_types.Value: return _apply_window_if_present(column.last(), window) @compile_unary_agg.register def _( - op: agg_ops.LastNonNullOp, column: ibis_types.Column, window=None + op: agg_ops.LastNonNullOp, + column: ibis_types.Column, + window=None, ) -> ibis_types.Value: return _apply_window_if_present( vendored_ibis_ops.LastNonNullValue(column).to_expr(), window # type: ignore @@ -379,7 +453,11 @@ def _( @compile_unary_agg.register -def _(op: agg_ops.ShiftOp, column: 
ibis_types.Column, window=None) -> ibis_types.Value: +def _( + op: agg_ops.ShiftOp, + column: ibis_types.Column, + window=None, +) -> ibis_types.Value: if op.periods == 0: # No-op return column if op.periods > 0: @@ -388,7 +466,11 @@ def _(op: agg_ops.ShiftOp, column: ibis_types.Column, window=None) -> ibis_types @compile_unary_agg.register -def _(op: agg_ops.DiffOp, column: ibis_types.Column, window=None) -> ibis_types.Value: +def _( + op: agg_ops.DiffOp, + column: ibis_types.Column, + window=None, +) -> ibis_types.Value: shifted = compile_unary_agg(agg_ops.ShiftOp(op.periods), column, window) if column.type().is_boolean(): return cast(ibis_types.BooleanColumn, column) != cast( @@ -404,7 +486,9 @@ def _(op: agg_ops.DiffOp, column: ibis_types.Column, window=None) -> ibis_types. @compile_unary_agg.register def _( - op: agg_ops.AllOp, column: ibis_types.Column, window=None + op: agg_ops.AllOp, + column: ibis_types.Column, + window=None, ) -> ibis_types.BooleanValue: # BQ will return null for empty column, result would be true in pandas. result = _is_true(column).all() @@ -416,7 +500,9 @@ def _( @compile_unary_agg.register def _( - op: agg_ops.AnyOp, column: ibis_types.Column, window=None + op: agg_ops.AnyOp, + column: ibis_types.Column, + window=None, ) -> ibis_types.BooleanValue: # BQ will return null for empty column, result would be false in pandas. result = _is_true(column).any() @@ -426,6 +512,31 @@ def _( ) +@compile_ordered_unary_agg.register +def _( + op: agg_ops.ArrayAggOp, + column: ibis_types.Column, + window=None, + order_by: typing.Sequence[ibis_types.Value] = [], +) -> ibis_types.ArrayValue: + # BigQuery doesn't currently support using ARRAY_AGG with both window and aggregate + # functions simultaneously. Some aggregate functions (or its equivalent syntax) + # are more important, such as: + # - `IGNORE NULLS` are required to avoid an raised error if the final result + # contains a NULL element. + # - `ORDER BY` are required for the default ordering mode. + # To keep things simpler, windowing support is skipped for now. + if window is not None: + raise NotImplementedError( + f"ArrayAgg with windowing is not supported. {constants.FEEDBACK_LINK}" + ) + + return vendored_ibis_ops.ArrayAggregate( + column, + order_by=order_by, + ).to_expr() + + @compile_binary_agg.register def _( op: agg_ops.CorrOp, left: ibis_types.Column, right: ibis_types.Column, window=None diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index cc1d6baaa1..552061f612 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -16,9 +16,8 @@ import abc import functools import itertools -import textwrap import typing -from typing import Collection, Iterable, Literal, Optional, Sequence +from typing import Collection, Literal, Optional, Sequence import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis @@ -40,6 +39,7 @@ OrderingExpression, ) import bigframes.core.schema as schemata +import bigframes.core.sql from bigframes.core.window_spec import RangeWindowBounds, RowsWindowBounds, WindowSpec import bigframes.dtypes import bigframes.operations.aggregations as agg_ops @@ -102,6 +102,12 @@ def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: def _ibis_bindings(self) -> dict[str, ibis_types.Value]: return {col: self._get_ibis_column(col) for col in self.column_ids} + @property + @abc.abstractmethod + def is_ordered_ir(self: T) -> bool: + """Whether it is a OrderedIR or UnorderedIR.""" + ... 
+ @abc.abstractmethod def filter(self: T, predicate: ex.Expression) -> T: """Filter the table on a given expression, the predicate must be a boolean expression.""" @@ -163,6 +169,53 @@ def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), ) + def _aggregate_base( + self, + table: ibis_types.Table, + order_by: typing.Sequence[ibis_types.Value] = [], + aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]] = [], + by_column_ids: typing.Sequence[str] = (), + dropna: bool = True, + ) -> OrderedIR: + assert not self.is_ordered_ir or len(order_by) > 0 + + bindings = {col: table[col] for col in self.column_ids} + stats = { + col_out: agg_compiler.compile_aggregate( + aggregate, bindings, order_by=order_by + ) + for aggregate, col_out in aggregations + } + if by_column_ids: + result = table.group_by(by_column_ids).aggregate(**stats) + # Must have deterministic ordering, so order by the unique "by" column + ordering = ExpressionOrdering( + tuple([ascending_over(column_id) for column_id in by_column_ids]), + total_ordering_columns=frozenset(by_column_ids), + ) + columns = tuple(result[key] for key in result.columns) + expr = OrderedIR(result, columns=columns, ordering=ordering) + if dropna: + for column_id in by_column_ids: + expr = expr._filter(expr._get_ibis_column(column_id).notnull()) + return expr + else: + aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} + result = table.aggregate(**aggregates) + # Ordering is irrelevant for single-row output, but set ordering id regardless + # as other ops(join etc.) expect it. + # TODO: Maybe can make completely empty + ordering = ExpressionOrdering( + ordering_value_columns=tuple([]), + total_ordering_columns=frozenset([]), + ) + return OrderedIR( + result, + columns=[result[col_id] for col_id in [*stats.keys()]], + hidden_ordering_columns=[result[ORDER_ID_COLUMN]], + ordering=ordering, + ) + # Ibis Implementations class UnorderedIR(BaseIbisIR): @@ -174,6 +227,10 @@ def __init__( ): super().__init__(table, columns, predicates) + @property + def is_ordered_ir(self) -> bool: + return False + def builder(self): """Creates a mutable builder for expressions.""" # Since ArrayValue is intended to be immutable (immutability offers @@ -310,44 +367,17 @@ def aggregate( Apply aggregations to the expression. Arguments: aggregations: input_column_id, operation, output_column_id tuples - by_column_id: column id of the aggregation key, this is preserved through the transform + by_column_ids: column ids of the aggregation key, this is preserved through + the transform dropna: whether null keys should be dropped + Returns: + OrderedIR: the grouping key is a unique-valued column and has ordering + information. 
""" table = self._to_ibis_expr() - bindings = {col: table[col] for col in self.column_ids} - stats = { - col_out: agg_compiler.compile_aggregate(aggregate, bindings) - for aggregate, col_out in aggregations - } - if by_column_ids: - result = table.group_by(by_column_ids).aggregate(**stats) - # Must have deterministic ordering, so order by the unique "by" column - ordering = ExpressionOrdering( - tuple([ascending_over(column_id) for column_id in by_column_ids]), - total_ordering_columns=frozenset(by_column_ids), - ) - columns = tuple(result[key] for key in result.columns) - expr = OrderedIR(result, columns=columns, ordering=ordering) - if dropna: - for column_id in by_column_ids: - expr = expr._filter(expr._get_ibis_column(column_id).notnull()) - # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation - return expr._project_offsets() - else: - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. - # TODO: Maybe can make completely empty - ordering = ExpressionOrdering( - ordering_value_columns=tuple([]), - total_ordering_columns=frozenset([]), - ) - return OrderedIR( - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, - ) + return self._aggregate_base( + table, aggregations=aggregations, by_column_ids=by_column_ids, dropna=dropna + ) def _uniform_sampling(self, fraction: float) -> UnorderedIR: """Sampling the table on given fraction. @@ -526,6 +556,10 @@ def __init__( if not ordering_valid: raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") + @property + def is_ordered_ir(self) -> bool: + return True + @classmethod def from_pandas( cls, @@ -535,7 +569,8 @@ def from_pandas( """ Builds an in-memory only (SQL only) expr from a pandas dataframe. - Assumed that the dataframe has unique string column names and bigframes-suppported dtypes. + Assumed that the dataframe has unique string column names and bigframes-suppported + dtypes. """ # ibis memtable cannot handle NA, must convert to None @@ -572,7 +607,8 @@ def _hidden_column_ids(self) -> typing.Sequence[str]: @property def _ibis_order(self) -> Sequence[ibis_types.Value]: - """Returns a sequence of ibis values which can be directly used to order a table expression. Has direction modifiers applied.""" + """Returns a sequence of ibis values which can be directly used to order a + table expression. Has direction modifiers applied.""" return _convert_ordering_to_table_values( {**self._column_names, **self._hidden_ordering_column_names}, self._ordering.all_ordering_columns, @@ -604,6 +640,44 @@ def reversed(self) -> OrderedIR: expr_builder.ordering = self._ordering.with_reverse() return expr_builder.build() + def aggregate( + self, + aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], + by_column_ids: typing.Sequence[str] = (), + dropna: bool = True, + ) -> OrderedIR: + """ + Apply aggregations to the expression. 
+ Arguments: + aggregations: input_column_id, operation, output_column_id tuples + by_column_ids: column ids of the aggregation key, this is preserved through + the transform + dropna: whether null keys should be dropped + Returns: + OrderedIR + """ + table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) + + all_columns = { + column_name: table[column_name] + for column_name in { + **self._column_names, + **self._hidden_ordering_column_names, + } + } + order_by = _convert_ordering_to_table_values( + all_columns, + self._ordering.all_ordering_columns, + ) + + return self._aggregate_base( + table, + order_by=order_by, + aggregations=aggregations, + by_column_ids=by_column_ids, + dropna=dropna, + ) + def _uniform_sampling(self, fraction: float) -> OrderedIR: """Sampling the table on given fraction. @@ -821,15 +895,13 @@ def to_sql( ) ) output_columns = [ - col_id_overrides.get(col) if (col in col_id_overrides) else col - for col in baked_ir.column_ids + col_id_overrides.get(col, col) for col in baked_ir.column_ids ] - selection = ", ".join(map(lambda col_id: f"`{col_id}`", output_columns)) + sql = bigframes.core.sql.select_from_subquery(output_columns, sql) - sql = textwrap.dedent(f"SELECT {selection}\n" "FROM (\n" f"{sql}\n" ")\n") # Single row frames may not have any ordering columns if len(baked_ir._ordering.all_ordering_columns) > 0: - order_by_clause = baked_ir._ordering_clause( + order_by_clause = bigframes.core.sql.ordering_clause( baked_ir._ordering.all_ordering_columns ) sql += f"{order_by_clause}\n" @@ -843,22 +915,6 @@ def to_sql( ) return typing.cast(str, sql) - def _ordering_clause(self, ordering: Iterable[OrderingExpression]) -> str: - parts = [] - for col_ref in ordering: - asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC" - null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST" - ordering_expr = col_ref.scalar_expression - # We don't know how to compile scalar expressions in isolation - if ordering_expr.is_const: - # Probably shouldn't have constants in ordering definition, but best to ignore if somehow they end up here. - continue - if not isinstance(ordering_expr, ex.UnboundVariableExpression): - raise ValueError("Expected direct column reference.") - part = f"`{ordering_expr.id}` {asc_desc} {null_clause}" - parts.append(part) - return f"ORDER BY {' ,'.join(parts)}" - def _to_ibis_expr( self, *, @@ -1087,7 +1143,8 @@ def _bake_ordering(self) -> OrderedIR: ) def _project_offsets(self) -> OrderedIR: - """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. Has no effect on expression semantics.""" + """Create a new expression that contains offsets. Should only be executed when + offsets are needed for an operations. 
Has no effect on expression semantics.""" if self._ordering.is_sequential: return self table = self._to_ibis_expr( diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index a68023d13d..a9908192f3 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -155,10 +155,18 @@ def compile_rowcount(node: nodes.RowCountNode, ordered: bool = True): @_compile_node.register def compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): - result = compile_unordered_ir(node.child).aggregate( - node.aggregations, node.by_column_ids, node.dropna + has_ordered_aggregation_ops = any( + aggregate.op.can_order_by for aggregate, _ in node.aggregations ) - return result if ordered else result.to_unordered() + if ordered and has_ordered_aggregation_ops: + return compile_ordered_ir(node.child).aggregate( + node.aggregations, node.by_column_ids, node.dropna + ) + else: + result = compile_unordered_ir(node.child).aggregate( + node.aggregations, node.by_column_ids, node.dropna + ) + return result if ordered else result.to_unordered() @_compile_node.register @@ -180,10 +188,10 @@ def compile_reproject(node: nodes.ReprojectOpNode, ordered: bool = True): @_compile_node.register -def compiler_explode(node: nodes.ExplodeNode, ordered: bool = True): +def compile_explode(node: nodes.ExplodeNode, ordered: bool = True): return compile_node(node.child, ordered).explode(node.column_ids) @_compile_node.register -def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True): +def compile_random_sample(node: nodes.RandomSampleNode, ordered: bool = True): return compile_node(node.child, ordered)._uniform_sampling(node.fraction) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 8a44844fba..e8e5a1f3ac 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -17,6 +17,7 @@ import functools import typing +import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis import ibis.common.exceptions import ibis.expr.datatypes as ibis_dtypes @@ -737,7 +738,7 @@ def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): return struct_value[name].name(name) -def numeric_to_datatime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue: +def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue: if not isinstance(x, ibis_types.IntegerValue) and not isinstance( x, ibis_types.FloatingValue ): @@ -779,7 +780,7 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): # with pandas converting int64[pyarrow] to timestamp[us][pyarrow], # timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow]. unit = "us" - x_converted = numeric_to_datatime(x, unit) + x_converted = numeric_to_datetime(x, unit) if to_type == ibis_dtypes.timestamp: return x_converted.cast(ibis_dtypes.Timestamp()) elif to_type == ibis_dtypes.Timestamp(timezone="UTC"): @@ -818,23 +819,39 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): @scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): if x.type() == ibis_dtypes.str: - x = x.to_timestamp(op.format) if op.format else timestamp(x) - elif x.type() == ibis_dtypes.Timestamp(timezone="UTC"): + return vendored_ibis_ops.SafeCastToDatetime(x).to_expr() + else: + # Numerical inputs. 
if op.format: - raise NotImplementedError( - f"Format parameter is not supported for Timestamp input types. {constants.FEEDBACK_LINK}" - ) - return x - elif x.type() != ibis_dtypes.timestamp: + x = x.cast(ibis_dtypes.str).to_timestamp(op.format) + else: + # The default unit is set to "ns" (nanoseconds) for consistency + # with pandas, where "ns" is the default unit for datetime operations. + unit = op.unit or "ns" + x = numeric_to_datetime(x, unit) + + return x.cast(ibis_dtypes.Timestamp(None)) + + +@scalar_op_compiler.register_unary_op(ops.ToTimestampOp, pass_op=True) +def to_timestamp_op_impl(x: ibis_types.Value, op: ops.ToTimestampOp): + if x.type() == ibis_dtypes.str: + x = ( + typing.cast(ibis_types.StringValue, x).to_timestamp(op.format) + if op.format + else timestamp(x) + ) + else: + # Numerical inputs. if op.format: x = x.cast(ibis_dtypes.str).to_timestamp(op.format) else: # The default unit is set to "ns" (nanoseconds) for consistency # with pandas, where "ns" is the default unit for datetime operations. unit = op.unit or "ns" - x = numeric_to_datatime(x, unit) + x = numeric_to_datetime(x, unit) - return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) + return x.cast(ibis_dtypes.Timestamp(timezone="UTC")) @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index bc03bd1df0..582141d539 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -160,7 +160,8 @@ def __getitem__(self, key): columns = key[1] if isinstance(columns, pd.Series) and columns.dtype == "bool": - columns = df.columns[columns] + # TODO(b/340892590): fix type error + columns = df.columns[columns] # type: ignore return df[columns] diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 569dae4ffc..7f2c56c20a 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -91,7 +91,8 @@ def __new__( from bigframes.core.indexes.multi import MultiIndex klass = MultiIndex if len(block._index_columns) > 1 else cls - result = typing.cast(Index, object.__new__(klass)) + # TODO(b/340893286): fix type error + result = typing.cast(Index, object.__new__(klass)) # type: ignore result._query_job = None result._block = block return result @@ -100,6 +101,11 @@ def __new__( def from_frame( cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] ) -> Index: + if len(frame._block.index_columns) == 0: + raise bigframes.exceptions.NullIndexError( + "Cannot access index properties with Null Index. Set an index using set_index." 
+ ) + frame._block._throw_if_null_index("from_frame") index = Index(frame._block) index._linked_frame = frame return index diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index e3a07c04b4..15999c0558 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -98,12 +98,12 @@ def order_with(self, by: Tuple[order.OrderingExpression, ...]): self.root, self.columns, self.predicate, new_ordering, self.reverse_root ) - def maybe_join( + def can_join( self, right: SquashedSelect, join_def: join_defs.JoinDefinition - ) -> Optional[SquashedSelect]: + ) -> bool: if join_def.type == "cross": # Cannot convert cross join to projection - return None + return False r_exprs_by_id = {id: expr for expr, id in right.columns} l_exprs_by_id = {id: expr for expr, id in self.columns} @@ -113,10 +113,17 @@ def maybe_join( if (self.root != right.root) or any( l_expr != r_expr for l_expr, r_expr in zip(l_join_exprs, r_join_exprs) ): + return False + return True + + def maybe_merge( + self, + right: SquashedSelect, + join_type: join_defs.JoinType, + mappings: Tuple[join_defs.JoinColumnMapping, ...], + ) -> Optional[SquashedSelect]: + if self.root != right.root: return None - - join_type = join_def.type - # Mask columns and remap names to expected schema lselection = self.columns rselection = right.columns @@ -136,7 +143,7 @@ def maybe_join( lselection = tuple((apply_mask(expr, lmask), id) for expr, id in lselection) if rmask is not None: rselection = tuple((apply_mask(expr, rmask), id) for expr, id in rselection) - new_columns = remap_names(join_def, lselection, rselection) + new_columns = remap_names(mappings, lselection, rselection) # Reconstruct ordering reverse_root = self.reverse_root @@ -201,20 +208,27 @@ def maybe_squash_projection(node: nodes.BigFrameNode) -> nodes.BigFrameNode: def maybe_rewrite_join(join_node: nodes.JoinNode) -> nodes.BigFrameNode: left_side = SquashedSelect.from_node(join_node.left_child) right_side = SquashedSelect.from_node(join_node.right_child) - joined = left_side.maybe_join(right_side, join_node.join) - if joined is not None: - return joined.expand() + if left_side.can_join(right_side, join_node.join): + merged = left_side.maybe_merge( + right_side, join_node.join.type, join_node.join.mappings + ) + assert ( + merged is not None + ), "Couldn't merge nodes. This shouldn't happen. Please share full stacktrace with the BigQuery DataFrames team at bigframes-feedback@google.com." + return merged.expand() else: return join_node def remap_names( - join: join_defs.JoinDefinition, lselection: Selection, rselection: Selection + mappings: Tuple[join_defs.JoinColumnMapping, ...], + lselection: Selection, + rselection: Selection, ) -> Selection: new_selection: Selection = tuple() l_exprs_by_id = {id: expr for expr, id in lselection} r_exprs_by_id = {id: expr for expr, id in rselection} - for mapping in join.mappings: + for mapping in mappings: if mapping.source_table == join_defs.JoinSide.LEFT: expr = l_exprs_by_id[mapping.source_id] else: # Right diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 31ee5f9064..c1e319b860 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -11,49 +11,161 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations """ Utility functions for SQL construction. 
""" -from typing import Iterable +import datetime +import math +import textwrap +from typing import Iterable, TYPE_CHECKING +# Literals and identifiers matching this pattern can be unquoted +unquoted = r"^[A-Za-z_][A-Za-z_0-9]*$" -def quote(value: str): - """Return quoted input string.""" - # Let's use repr which also escapes any special characters - # - # >>> for val in [ - # ... "123", - # ... "str with no special chars", - # ... "str with special chars.,'\"/\\" - # ... ]: - # ... print(f"{val} -> {repr(val)}") - # ... - # 123 -> '123' - # str with no special chars -> 'str with no special chars' - # str with special chars.,'"/\ -> 'str with special chars.,\'"/\\' +if TYPE_CHECKING: + import google.cloud.bigquery as bigquery - return repr(value) + import bigframes.core.ordering -def column_reference(column_name: str): +### Writing SQL Values (literals, column references, table references, etc.) +def simple_literal(value: str | int | bool | float | datetime.datetime): + """Return quoted input string.""" + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals + if isinstance(value, str): + # Single quoting seems to work nicer with ibis than double quoting + return f"'{escape_special_characters(value)}'" + elif isinstance(value, (bool, int)): + return str(value) + elif isinstance(value, float): + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#floating_point_literals + if math.isnan(value): + return 'CAST("nan" as FLOAT)' + if value == math.inf: + return 'CAST("+inf" as FLOAT)' + if value == -math.inf: + return 'CAST("-inf" as FLOAT)' + return str(value) + if isinstance(value, datetime.datetime): + return f"TIMESTAMP('{value.isoformat()}')" + else: + raise ValueError(f"Cannot produce literal for {value}") + + +def multi_literal(*values: str): + literal_strings = [simple_literal(i) for i in values] + return "(" + ", ".join(literal_strings) + ")" + + +def identifier(id: str) -> str: """Return a string representing column reference in a SQL.""" + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers + # Just always escape, otherwise need to check against every reserved sql keyword + return f"`{escape_special_characters(id)}`" + + +def escape_special_characters(value: str): + """Escapes all special charactesrs""" + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals + trans_table = str.maketrans( + { + "\a": r"\a", + "\b": r"\b", + "\f": r"\f", + "\n": r"\n", + "\r": r"\r", + "\t": r"\t", + "\v": r"\v", + "\\": r"\\", + "?": r"\?", + '"': r"\"", + "'": r"\'", + "`": r"\`", + } + ) + return value.translate(trans_table) + + +def cast_as_string(column_name: str) -> str: + """Return a string representing string casting of a column.""" - return f"`{column_name}`" + return f"CAST({identifier(column_name)} AS STRING)" -def cast_as_string(column_name: str): - """Return a string representing string casting of a column.""" +def csv(values: Iterable[str]) -> str: + """Return a string of comma separated values.""" + return ", ".join(values) - return f"CAST({column_reference(column_name)} AS STRING)" +def table_reference(table_ref: bigquery.TableReference) -> str: + return f"`{escape_special_characters(table_ref.project)}`.`{escape_special_characters(table_ref.dataset_id)}`.`{escape_special_characters(table_ref.table_id)}`" -def csv(values: Iterable[str], 
quoted=False): - """Return a string of comma separated values.""" - if quoted: - values = [quote(val) for val in values] +def infix_op(opname: str, left_arg: str, right_arg: str): + # Maybe should add parentheses?? + return f"{left_arg} {opname} {right_arg}" - return ", ".join(values) + +### Writing SELECT expressions +def select_from_subquery(columns: Iterable[str], subquery: str, distinct: bool = False): + selection = ", ".join(map(identifier, columns)) + distinct_clause = "DISTINCT " if distinct else "" + + return textwrap.dedent( + f"SELECT {distinct_clause}{selection}\nFROM (\n" f"{subquery}\n" ")\n" + ) + + +def select_from_table_ref( + columns: Iterable[str], table_ref: bigquery.TableReference, distinct: bool = False +): + selection = ", ".join(map(identifier, columns)) + distinct_clause = "DISTINCT " if distinct else "" + + return textwrap.dedent( + f"SELECT {distinct_clause}{selection}\nFROM {table_reference(table_ref)}" + ) + + +def select_table(table_ref: bigquery.TableReference): + return textwrap.dedent(f"SELECT * FROM {table_reference(table_ref)}") + + +def is_distinct_sql(columns: Iterable[str], table_ref: bigquery.TableReference) -> str: + is_unique_sql = f"""WITH full_table AS ( + {select_from_table_ref(columns, table_ref)} + ), + distinct_table AS ( + {select_from_table_ref(columns, table_ref, distinct=True)} + ) + + SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`, + (SELECT COUNT(*) FROM distinct_table) AS `distinct_count` + """ + return is_unique_sql + + +def ordering_clause( + ordering: Iterable[bigframes.core.ordering.OrderingExpression], +) -> str: + import bigframes.core.expression + + parts = [] + for col_ref in ordering: + asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC" + null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST" + ordering_expr = col_ref.scalar_expression + # We don't know how to compile scalar expressions in isolation + if ordering_expr.is_const: + # Probably shouldn't have constants in ordering definition, but best to ignore if somehow they end up here. + continue + assert isinstance( + ordering_expr, bigframes.core.expression.UnboundVariableExpression + ) + part = f"`{ordering_expr.id}` {asc_desc} {null_clause}" + parts.append(part) + return f"ORDER BY {' ,'.join(parts)}" diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index a2851bc256..5eac4cceb9 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -21,6 +21,7 @@ import bigframes.constants as constants import bigframes.dataframe +import bigframes.dtypes import bigframes.operations as ops import bigframes.series @@ -51,25 +52,68 @@ def to_datetime( f"to datetime is not implemented. {constants.FEEDBACK_LINK}" ) - arg = bigframes.series.Series(arg) + arg = bigframes.series.Series(arg)._cached() - if not utc and arg.dtype not in ("Int64", "Float64"): # type: ignore - raise NotImplementedError( - f"String and Timestamp requires utc=True. {constants.FEEDBACK_LINK}" - ) - - if format and unit and arg.dtype in ("Int64", "Float64"): # type: ignore + if format and unit and arg.dtype in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore raise ValueError("cannot specify both format and unit") - if unit and arg.dtype not in ("Int64", "Float64"): # type: ignore + if unit and arg.dtype not in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore raise NotImplementedError( f"Unit parameter is not supported for non-numerical input types. 
{constants.FEEDBACK_LINK}" ) - return arg._apply_unary_op( # type: ignore - ops.ToDatetimeOp( - utc=utc, - format=format, - unit=unit, + if arg.dtype in (bigframes.dtypes.TIMESTAMP_DTYPE, bigframes.dtypes.DATETIME_DTYPE): + to_type = ( + bigframes.dtypes.TIMESTAMP_DTYPE if utc else bigframes.dtypes.DATETIME_DTYPE + ) + return arg._apply_unary_op(ops.AsTypeOp(to_type=to_type)) # type: ignore + if (not utc) and arg.dtype == bigframes.dtypes.STRING_DTYPE: + if format: + raise NotImplementedError( + f"Customized formats are not supported for string inputs when utc=False. Please set utc=True if possible. {constants.FEEDBACK_LINK}" + ) + + assert unit is None + as_datetime = arg._apply_unary_op( # type: ignore + ops.ToDatetimeOp( + format=format, + unit=unit, + ) + ) + failed_datetime_cast = arg.notnull() & as_datetime.isnull() + is_utc = arg._apply_unary_op( + ops.EndsWithOp( + pat=("Z", "-00:00", "+00:00", "-0000", "+0000", "-00", "+00") + ) + ) + + # Cast to DATETIME shall succeed if all inputs are tz-naive. + if not failed_datetime_cast.any(): + return as_datetime + + if is_utc.all(): + return arg._apply_unary_op( # type: ignore + ops.ToTimestampOp( + format=format, + unit=unit, + ) + ) + + raise NotImplementedError( + f"Non-UTC string inputs are not supported when utc=False. Please set utc=True if possible. {constants.FEEDBACK_LINK}" + ) + # If utc: + elif utc: + return arg._apply_unary_op( # type: ignore + ops.ToTimestampOp( + format=format, + unit=unit, + ) + ) + else: + return arg._apply_unary_op( # type: ignore + ops.ToDatetimeOp( + format=format, + unit=unit, + ) ) - ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d3fd39afa7..105588de2f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -17,6 +17,7 @@ from __future__ import annotations import datetime +import functools import inspect import re import sys @@ -42,6 +43,7 @@ import google.cloud.bigquery as bigquery import numpy import pandas +import pandas.io.formats.format import tabulate import bigframes @@ -87,6 +89,15 @@ ) +def requires_index(meth): + @functools.wraps(meth) + def guarded_meth(df: DataFrame, *args, **kwargs): + df._throw_if_null_index(meth.__name__) + return meth(df, *args, **kwargs) + + return guarded_meth + + # Inherits from pandas DataFrame so that we can use the same docstrings. 
@log_adapter.class_logger class DataFrame(vendored_pandas_frame.DataFrame): @@ -244,6 +255,7 @@ def _sql_names( return results @property + @requires_index def index( self, ) -> indexes.Index: @@ -259,6 +271,7 @@ def index(self, value): self.index.name = value.name if hasattr(value, "name") else None @property + @requires_index def loc(self) -> indexers.LocDataFrameIndexer: return indexers.LocDataFrameIndexer(self) @@ -271,6 +284,7 @@ def iat(self) -> indexers.IatDataFrameIndexer: return indexers.IatDataFrameIndexer(self) @property + @requires_index def at(self) -> indexers.AtDataFrameIndexer: return indexers.AtDataFrameIndexer(self) @@ -317,10 +331,15 @@ def bqclient(self) -> bigframes.Session: def _session(self) -> bigframes.Session: return self._get_block().expr.session + @property + def _has_index(self) -> bool: + return len(self._block.index_columns) > 0 + @property def T(self) -> DataFrame: return DataFrame(self._get_block().transpose()) + @requires_index def transpose(self) -> DataFrame: return self.T @@ -613,7 +632,15 @@ def __repr__(self) -> str: column_count = len(pandas_df.columns) with display_options.pandas_repr(opts): - repr_string = repr(pandas_df) + import pandas.io.formats + + # safe to mutate this, this dict is owned by this code, and does not affect global config + to_string_kwargs = ( + pandas.io.formats.format.get_dataframe_repr_params() # type: ignore + ) + if not self._has_index: + to_string_kwargs.update({"index": False}) + repr_string = pandas_df.to_string(**to_string_kwargs) # Modify the end of the string to reflect count. lines = repr_string.split("\n") @@ -813,15 +840,18 @@ def _apply_dataframe_binop( ) # join columns schema # indexers will be none for exact match - columns, lcol_indexer, rcol_indexer = self.columns.join( - other.columns, how=how, return_indexers=True - ) + if self.columns.equals(other.columns): + columns, lcol_indexer, rcol_indexer = self.columns, None, None + else: + columns, lcol_indexer, rcol_indexer = self.columns.join( + other.columns, how=how, return_indexers=True + ) binop_result_ids = [] column_indices = zip( lcol_indexer if (lcol_indexer is not None) else range(len(columns)), - rcol_indexer if (lcol_indexer is not None) else range(len(columns)), + rcol_indexer if (rcol_indexer is not None) else range(len(columns)), ) for left_index, right_index in column_indices: @@ -1329,6 +1359,7 @@ def drop( block = self._block if index is not None: + self._throw_if_null_index("drop(axis=0)") level_id = self._resolve_levels(level or 0)[0] if utils.is_list_like(index): @@ -1603,6 +1634,7 @@ def set_index( col_ids_strs: List[str] = [col_id for col_id in col_ids if col_id is not None] return DataFrame(self._block.set_index(col_ids_strs, append=append, drop=drop)) + @requires_index def sort_index( self, ascending: bool = True, na_position: Literal["first", "last"] = "last" ) -> DataFrame: @@ -1762,7 +1794,8 @@ def label_filter(label): if like: return like in label_str else: # regex - return re.match(regex, label_str) is not None + # TODO(b/340891296): fix type error + return re.match(regex, label_str) is not None # type: ignore cols = [ col_id @@ -1803,6 +1836,7 @@ def reindex( if columns is not None: return self._reindex_columns(columns) + @requires_index def _reindex_rows( self, index, @@ -1849,9 +1883,11 @@ def _reindex_columns(self, columns): result_df.columns = new_column_index return result_df + @requires_index def reindex_like(self, other: DataFrame, *, validate: typing.Optional[bool] = None): return self.reindex(index=other.index, 
columns=other.columns, validate=validate) + @requires_index def interpolate(self, method: str = "linear") -> DataFrame: if method == "pad": return self.ffill() @@ -2043,14 +2079,13 @@ def quantile( if multi_q: return DataFrame(result.stack()).droplevel(0) else: - result_df = ( - DataFrame(result) - .stack(list(range(0, frame.columns.nlevels))) - .droplevel(0) + # Drop the last level, which contains q, unnecessary since only one q + result = result.with_column_labels(result.column_labels.droplevel(-1)) + result, index_col = result.create_constant(q, None) + result = result.set_index([index_col]) + return bigframes.series.Series( + result.transpose(original_row_index=pandas.Index([q])) ) - result_series = bigframes.series.Series(result_df._block) - result_series.name = q - return result_series def std( self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False @@ -2145,9 +2180,11 @@ def agg( aggregate = agg aggregate.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.agg) + @requires_index def idxmin(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmin(self._block)) + @requires_index def idxmax(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmax(self._block)) @@ -2254,6 +2291,7 @@ def _pivot( ) return DataFrame(pivot_block) + @requires_index def pivot( self, *, @@ -2267,6 +2305,7 @@ def pivot( ) -> DataFrame: return self._pivot(columns=columns, index=index, values=values) + @requires_index def pivot_table( self, values: typing.Optional[ @@ -2365,6 +2404,7 @@ def _stack_multi(self, level: LevelsType = -1): block = block.stack(levels=len(level)) return DataFrame(block) + @requires_index def unstack(self, level: LevelsType = -1): if not utils.is_list_like(level): level = [level] @@ -2612,6 +2652,7 @@ def groupby( else: raise TypeError("You have to supply one of 'by' and 'level'") + @requires_index def _groupby_level( self, level: LevelsType, @@ -3577,3 +3618,9 @@ def __matmul__(self, other) -> DataFrame: return self.dot(other) __matmul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__matmul__) + + def _throw_if_null_index(self, opname: str): + if not self._has_index: + raise bigframes.exceptions.NullIndexError( + f"DataFrame cannot perform {opname} as it has no index. Set an index using set_index." + ) diff --git a/bigframes/enums.py b/bigframes/enums.py index 4bec75f5df..9501d3f13e 100644 --- a/bigframes/enums.py +++ b/bigframes/enums.py @@ -27,3 +27,5 @@ class DefaultIndexKind(enum.Enum): #: ``n - 3``, ``n - 2``, ``n - 1``, where ``n`` is the number of items in #: the index. SEQUENTIAL_INT64 = enum.auto() + # A completely null index incapable of indexing or alignment. 
+ NULL = enum.auto() diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index eae021b4cd..027b3a4236 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -37,3 +37,7 @@ class DefaultIndexWarning(Warning): class PreviewWarning(Warning): """The feature is in preview.""" + + +class NullIndexError(ValueError): + """Object has no index.""" diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 2a7a900779..9eff802cc7 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -1023,8 +1023,9 @@ def wrapper(f): raise TypeError("f must be callable, got {}".format(f)) signature = inspect.signature(f) + # TODO(b/340898611): fix type error ibis_signature = ibis_signature_from_python_signature( - signature, input_types, output_type + signature, input_types, output_type # type: ignore ) remote_function_client = RemoteFunctionClient( diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 929ccaecc5..fe9fe6df20 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -23,6 +23,7 @@ import pandas as pd import pyarrow as pa +import bigframes.dtypes import bigframes.dtypes as dtypes import bigframes.operations.type as op_typing @@ -490,7 +491,8 @@ def output_type(self, *input_types): if self.to_type == pa.string(): return dtypes.STRING_DTYPE if isinstance(self.to_type, str): - return dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[self.to_type] + # TODO(b/340895446): fix type error + return dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[self.to_type] # type: ignore return self.to_type @@ -512,7 +514,8 @@ class RemoteFunctionOp(UnaryOp): def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - return self.func.output_dtype + # TODO(b/340895446): fix type error + return self.func.output_dtype # type: ignore @dataclasses.dataclass(frozen=True) @@ -527,13 +530,34 @@ def output_type(self, *input_types): @dataclasses.dataclass(frozen=True) class ToDatetimeOp(UnaryOp): name: typing.ClassVar[str] = "to_datetime" - utc: bool = False format: typing.Optional[str] = None unit: typing.Optional[str] = None def output_type(self, *input_types): - timezone = "UTC" if self.utc else None - return pd.ArrowDtype(pa.timestamp("us", tz=timezone)) + if input_types[0] not in ( + bigframes.dtypes.FLOAT_DTYPE, + bigframes.dtypes.INT_DTYPE, + bigframes.dtypes.STRING_DTYPE, + ): + raise TypeError("expected string or numeric input") + return pd.ArrowDtype(pa.timestamp("us", tz=None)) + + +@dataclasses.dataclass(frozen=True) +class ToTimestampOp(UnaryOp): + name: typing.ClassVar[str] = "to_timestamp" + format: typing.Optional[str] = None + unit: typing.Optional[str] = None + + def output_type(self, *input_types): + # Must be numeric or string + if input_types[0] not in ( + bigframes.dtypes.FLOAT_DTYPE, + bigframes.dtypes.INT_DTYPE, + bigframes.dtypes.STRING_DTYPE, + ): + raise TypeError("expected string or numeric input") + return pd.ArrowDtype(pa.timestamp("us", tz="UTC")) @dataclasses.dataclass(frozen=True) @@ -605,7 +629,8 @@ class BinaryRemoteFunctionOp(BinaryOp): def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - return self.func.output_dtype + # TODO(b/340895446): fix type error + return self.func.output_dtype # type: ignore add_op = AddOp() diff --git 
a/bigframes/operations/_matplotlib/__init__.py b/bigframes/operations/_matplotlib/__init__.py index 02aca8cf5d..f869c1e01d 100644 --- a/bigframes/operations/_matplotlib/__init__.py +++ b/bigframes/operations/_matplotlib/__init__.py @@ -24,7 +24,8 @@ def plot(data, kind, **kwargs): - plot_obj = PLOT_CLASSES[kind](data, **kwargs) + # TODO(b/340896123): fix type error + plot_obj = PLOT_CLASSES[kind](data, **kwargs) # type: ignore plot_obj.generate() plot_obj.draw() return plot_obj.result diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index 04534e20a9..78b3df1f19 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -39,7 +39,8 @@ def draw(self) -> None: @property def result(self): - return self.axes + # TODO(b/340896123): fix type error + return self.axes # type: ignore class SamplingPlot(MPLPlot): diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index c57fac4112..3b5310554b 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -38,6 +38,10 @@ def uses_total_row_ordering(self): """Whether the operator needs total row ordering. (eg. lead, lag, array_agg)""" return False + @property + def can_order_by(self): + return False + @abc.abstractmethod def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: ... @@ -115,7 +119,7 @@ class QuantileOp(UnaryAggregateOp): @property def name(self): - return f"{int(self.q*100)}%" + return f"{int(self.q * 100)}%" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -127,7 +131,7 @@ class ApproxQuartilesOp(UnaryAggregateOp): @property def name(self): - return f"{self.quartile*25}%" + return f"{self.quartile * 25}%" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: if not dtypes.is_orderable(input_types[0]): @@ -222,6 +226,24 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ).output_type(input_types[0]) +@dataclasses.dataclass(frozen=True) +class ArrayAggOp(UnaryAggregateOp): + name: ClassVar[str] = "arrayagg" + + @property + def can_order_by(self): + return True + + @property + def skips_nulls(self): + return True + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return pd.ArrowDtype( + pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_types[0])) + ) + + @dataclasses.dataclass(frozen=True) class CutOp(UnaryWindowOp): # TODO: Unintuitive, refactor into multiple ops? 
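The ArrayAggOp aggregation above, together with the ToDatetimeOp/ToTimestampOp split, is surfaced through bigframes.bigquery.array_agg and to_datetime. A minimal usage sketch follows; the table and column names are hypothetical and not part of this change, and the calls assume a configured BigQuery session.

    import bigframes.pandas as bpd
    import bigframes.bigquery as bbq

    # array_agg (backed by ArrayAggOp): collect grouped values into ARRAY
    # columns; NULL values are skipped (skips_nulls=True above).
    # "my-project.my_dataset.orders" and its columns are hypothetical.
    df = bpd.read_gbq("my-project.my_dataset.orders")
    order_ids = bbq.array_agg(df.groupby("customer_id")["order_id"])

    # ToDatetimeOp vs. ToTimestampOp: tz-naive strings no longer require utc=True.
    s = bpd.Series(["2024-05-01 10:00:00", "2024-05-02 11:30:00"])
    naive = bpd.to_datetime(s)            # DATETIME (timezone-naive) result
    aware = bpd.to_datetime(s, utc=True)  # TIMESTAMP (UTC) result
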
diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 75d14f3fbc..49ef7f76ee 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -14,6 +14,7 @@ from __future__ import annotations +import functools import typing from typing import List, Sequence @@ -34,6 +35,15 @@ import bigframes.session +def requires_index(meth): + @functools.wraps(meth) + def guarded_meth(df: SeriesMethods, *args, **kwargs): + df._throw_if_null_index(meth.__name__) + return meth(df, *args, **kwargs) + + return guarded_meth + + class SeriesMethods: def __init__( self, @@ -266,3 +276,9 @@ def _align_n( block, constant_col_id = block.create_constant(other, dtype=dtype) value_ids = [*value_ids, constant_col_id] return (value_ids, block) + + def _throw_if_null_index(self, opname: str): + if len(self._block.index_columns) == 0: + raise bigframes.exceptions.NullIndexError( + f"Series cannot perform {opname} as it has no index. Set an index using set_index." + ) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 8d2c0b148c..3628ecf67b 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -19,7 +19,6 @@ from collections import namedtuple from datetime import datetime import inspect -import resource import sys import typing from typing import ( @@ -70,6 +69,13 @@ import bigframes.session._io.bigquery import bigframes.session.clients +try: + import resource +except ImportError: + # resource is only available on Unix-like systems. + # https://docs.python.org/3/library/resource.html + resource = None # type: ignore + # Include method definition so that the method appears in our docs for # bigframes.pandas general functions. @@ -543,6 +549,7 @@ def read_gbq_query( max_results: Optional[int] = None, use_cache: Optional[bool] = None, col_order: Iterable[str] = (), + filters: vendored_pandas_gbq.FiltersType = (), ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query) return global_session.with_default_session( @@ -554,6 +561,7 @@ def read_gbq_query( max_results=max_results, use_cache=use_cache, col_order=col_order, + filters=filters, ) @@ -810,12 +818,13 @@ def clean_up_by_session_id( # https://github.com/python/cpython/issues/112282 sys.setrecursionlimit(max(10000000, sys.getrecursionlimit())) -soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_STACK) -if soft_limit < hard_limit or hard_limit == resource.RLIM_INFINITY: - try: - resource.setrlimit(resource.RLIMIT_STACK, (hard_limit, hard_limit)) - except Exception: - pass +if resource is not None: + soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_STACK) + if soft_limit < hard_limit or hard_limit == resource.RLIM_INFINITY: + try: + resource.setrlimit(resource.RLIMIT_STACK, (hard_limit, hard_limit)) + except Exception: + pass # Use __all__ to let type checkers know what is part of the public API. 
__all___ = [ diff --git a/bigframes/series.py b/bigframes/series.py index d1fb0d679b..4595164e80 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -51,6 +51,7 @@ import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.operations.base +from bigframes.operations.base import requires_index import bigframes.operations.datetimes as dt import bigframes.operations.plotting as plotting import bigframes.operations.strings as strings @@ -85,6 +86,7 @@ def dtypes(self): return self._dtype @property + @requires_index def loc(self) -> bigframes.core.indexers.LocSeriesIndexer: return bigframes.core.indexers.LocSeriesIndexer(self) @@ -97,6 +99,7 @@ def iat(self) -> bigframes.core.indexers.IatSeriesIndexer: return bigframes.core.indexers.IatSeriesIndexer(self) @property + @requires_index def at(self) -> bigframes.core.indexers.AtSeriesIndexer: return bigframes.core.indexers.AtSeriesIndexer(self) @@ -135,6 +138,7 @@ def values(self) -> numpy.ndarray: return self.to_numpy() @property + @requires_index def index(self) -> indexes.Index: return indexes.Index.from_frame(self) @@ -236,6 +240,7 @@ def rename( raise ValueError(f"Unsupported type of parameter index: {type(index)}") + @requires_index def rename_axis( self, mapper: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], @@ -288,7 +293,17 @@ def __repr__(self) -> str: pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) self._set_internal_query_job(query_job) - return repr(pandas_df.iloc[:, 0]) + pd_series = pandas_df.iloc[:, 0] + + import pandas.io.formats + + # safe to mutate this, this dict is owned by this code, and does not affect global config + to_string_kwargs = pandas.io.formats.format.get_series_repr_params() # type: ignore + if len(self._block.index_columns) == 0: + to_string_kwargs.update({"index": False}) + repr_string = pd_series.to_string(**to_string_kwargs) + + return repr_string def astype( self, @@ -379,10 +394,12 @@ def drop( block = block.drop_columns([condition_id]) return Series(block.select_column(self._value_column)) + @requires_index def droplevel(self, level: LevelsType, axis: int | str = 0): resolved_level_ids = self._resolve_levels(level) return Series(self._block.drop_levels(resolved_level_ids)) + @requires_index def swaplevel(self, i: int = -2, j: int = -1): level_i = self._block.index_columns[i] level_j = self._block.index_columns[j] @@ -392,6 +409,7 @@ def swaplevel(self, i: int = -2, j: int = -1): ] return Series(self._block.reorder_levels(reordering)) + @requires_index def reorder_levels(self, order: LevelsType, axis: int | str = 0): resolved_level_ids = self._resolve_levels(order) return Series(self._block.reorder_levels(resolved_level_ids)) @@ -570,6 +588,7 @@ def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]): ) return Series(block.select_column(result)) + @requires_index def interpolate(self, method: str = "linear") -> Series: if method == "pad": return self.ffill() @@ -986,9 +1005,13 @@ def quantile(self, q: Union[float, Sequence[float]] = 0.5) -> Union[Series, floa qs = tuple(q) if utils.is_list_like(q) else (q,) result = block_ops.quantile(self._block, (self._value_column,), qs=qs) if utils.is_list_like(q): - result = result.stack() - result = result.drop_levels([result.index_columns[0]]) - return Series(result) + # Drop the first level, since only one column + result = result.with_column_labels(result.column_labels.droplevel(0)) + result, index_col = result.create_constant(self.name, None) + 
result = result.set_index([index_col]) + return Series( + result.transpose(original_row_index=pandas.Index([self.name])) + ) else: return cast(float, Series(result).to_pandas().squeeze()) @@ -1064,6 +1087,7 @@ def argmin(self) -> int: scalars.Scalar, Series(block.select_column(row_nums)).iloc[0] ) + @requires_index def unstack(self, level: LevelsType = -1): if isinstance(level, int) or isinstance(level, str): level = [level] @@ -1087,6 +1111,7 @@ def unstack(self, level: LevelsType = -1): ) return bigframes.dataframe.DataFrame(pivot_block) + @requires_index def idxmax(self) -> blocks.Label: block = self._block.order_by( [ @@ -1100,6 +1125,7 @@ def idxmax(self) -> blocks.Label: block = block.slice(0, 1) return indexes.Index(block).to_pandas()[0] + @requires_index def idxmin(self) -> blocks.Label: block = self._block.order_by( [ @@ -1209,6 +1235,7 @@ def sort_values( ) return Series(block) + @requires_index def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: # TODO(tbergeron): Support level parameter once multi-index introduced. if na_position not in ["first", "last"]: @@ -1269,6 +1296,7 @@ def groupby( else: raise TypeError("You have to supply one of 'by' and 'level'") + @requires_index def _groupby_level( self, level: int | str | typing.Sequence[int] | typing.Sequence[str], @@ -1406,9 +1434,11 @@ def combine( materialized_series = result_series._cached() return materialized_series + @requires_index def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_prefix(prefix)) + @requires_index def add_suffix(self, suffix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_suffix(suffix)) @@ -1460,6 +1490,7 @@ def filter( else: raise ValueError("Need to provide 'items', 'like', or 'regex'") + @requires_index def reindex(self, index=None, *, validate: typing.Optional[bool] = None): if validate and not self.index.is_unique: raise ValueError("Original index must be unique to reindex") @@ -1488,6 +1519,7 @@ def reindex(self, index=None, *, validate: typing.Optional[bool] = None): )._block return Series(result_block) + @requires_index def reindex_like(self, other: Series, *, validate: typing.Optional[bool] = None): return self.reindex(other.index, validate=validate) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 727269e7ee..ccdc3c5eeb 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -26,6 +26,7 @@ Any, Callable, Dict, + Hashable, IO, Iterable, List, @@ -62,6 +63,7 @@ import ibis import ibis.backends.bigquery as ibis_bigquery import ibis.expr.types as ibis_types +import jellyfish import numpy as np import pandas from pandas._typing import ( @@ -78,6 +80,7 @@ import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.compile +import bigframes.core.guid import bigframes.core.nodes as nodes from bigframes.core.ordering import IntegerEncoding import bigframes.core.ordering as order @@ -339,19 +342,6 @@ def read_gbq( elif col_order: columns = col_order - filters = list(filters) - if len(filters) != 0 or bf_io_bigquery.is_table_with_wildcard_suffix( - query_or_table - ): - # TODO(b/338111344): This appears to be missing index_cols, which - # are necessary to be selected. - # TODO(b/338039517): Refactor this to be called inside both - # _read_gbq_query and _read_gbq_table (after detecting primary keys) - # so we can make sure index_col/index_cols reflects primary keys. 
- query_or_table = bf_io_bigquery.to_query( - query_or_table, _to_index_cols(index_col), columns, filters - ) - if bf_io_bigquery.is_query(query_or_table): return self._read_gbq_query( query_or_table, @@ -361,6 +351,7 @@ def read_gbq( max_results=max_results, api_name="read_gbq", use_cache=use_cache, + filters=filters, ) else: if configuration is not None: @@ -377,6 +368,7 @@ def read_gbq( max_results=max_results, api_name="read_gbq", use_cache=use_cache if use_cache is not None else True, + filters=filters, ) def _query_to_destination( @@ -451,6 +443,7 @@ def read_gbq_query( max_results: Optional[int] = None, use_cache: Optional[bool] = None, col_order: Iterable[str] = (), + filters: third_party_pandas_gbq.FiltersType = (), ) -> dataframe.DataFrame: """Turn a SQL query into a DataFrame. @@ -517,6 +510,7 @@ def read_gbq_query( max_results=max_results, api_name="read_gbq_query", use_cache=use_cache, + filters=filters, ) def _read_gbq_query( @@ -529,6 +523,7 @@ def _read_gbq_query( max_results: Optional[int] = None, api_name: str = "read_gbq_query", use_cache: Optional[bool] = None, + filters: third_party_pandas_gbq.FiltersType = (), ) -> dataframe.DataFrame: import bigframes.dataframe as dataframe @@ -557,6 +552,21 @@ def _read_gbq_query( index_cols = _to_index_cols(index_col) + filters = list(filters) + if len(filters) != 0 or max_results is not None: + # TODO(b/338111344): If we are running a query anyway, we might as + # well generate ROW_NUMBER() at the same time. + query = bf_io_bigquery.to_query( + query, + index_cols, + columns, + filters, + max_results=max_results, + # We're executing the query, so we don't need time travel for + # determinism. + time_travel_timestamp=None, + ) + destination, query_job = self._query_to_destination( query, index_cols, @@ -580,12 +590,14 @@ def _read_gbq_query( session=self, ) - return self.read_gbq_table( + return self._read_gbq_table( f"{destination.project}.{destination.dataset_id}.{destination.table_id}", index_col=index_col, columns=columns, - max_results=max_results, use_cache=configuration["query"]["useQueryCache"], + api_name=api_name, + # max_results and filters are omitted because they are already + # handled by to_query(), above. ) def read_gbq_table( @@ -621,24 +633,6 @@ def read_gbq_table( elif col_order: columns = col_order - filters = list(filters) - if len(filters) != 0 or bf_io_bigquery.is_table_with_wildcard_suffix(query): - # TODO(b/338039517): Refactor this to be called inside both - # _read_gbq_query and _read_gbq_table (after detecting primary keys) - # so we can make sure index_col/index_cols reflects primary keys. 
- query = bf_io_bigquery.to_query( - query, _to_index_cols(index_col), columns, filters - ) - - return self._read_gbq_query( - query, - index_col=index_col, - columns=columns, - max_results=max_results, - api_name="read_gbq_table", - use_cache=use_cache, - ) - return self._read_gbq_table( query=query, index_col=index_col, @@ -646,6 +640,7 @@ def read_gbq_table( max_results=max_results, api_name="read_gbq_table", use_cache=use_cache, + filters=filters, ) def _read_gbq_table( @@ -657,6 +652,7 @@ def _read_gbq_table( max_results: Optional[int] = None, api_name: str, use_cache: bool = True, + filters: third_party_pandas_gbq.FiltersType = (), ) -> dataframe.DataFrame: import bigframes.dataframe as dataframe @@ -673,6 +669,9 @@ def _read_gbq_table( query, default_project=self.bqclient.project ) + columns = list(columns) + filters = list(filters) + # --------------------------------- # Fetch table metadata and validate # --------------------------------- @@ -684,62 +683,109 @@ def _read_gbq_table( cache=self._df_snapshot, use_cache=use_cache, ) + table_column_names = {field.name for field in table.schema} if table.location.casefold() != self._location.casefold(): raise ValueError( f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}" ) - # ----------------------------------------- - # Create Ibis table expression and validate - # ----------------------------------------- - - # Use a time travel to make sure the DataFrame is deterministic, even - # if the underlying table changes. - table_expression = bf_read_gbq_table.get_ibis_time_travel_table( - self.ibis_client, - table_ref, - time_travel_timestamp, - ) - for key in columns: - if key not in table_expression.columns: + if key not in table_column_names: + possibility = min( + table_column_names, + key=lambda item: jellyfish.levenshtein_distance(key, item), + ) raise ValueError( - f"Column '{key}' of `columns` not found in this table." + f"Column '{key}' of `columns` not found in this table. Did you mean '{possibility}'?" ) - # --------------------------------------- - # Create a non-default index and validate - # --------------------------------------- - - # TODO(b/337925142): Move index_cols creation to before we create the - # Ibis table expression so we don't have a "SELECT *" subquery in the - # query that checks for index uniqueness. - - index_cols, is_index_unique = bf_read_gbq_table.get_index_cols_and_uniqueness( - bqclient=self.bqclient, - ibis_client=self.ibis_client, + # Converting index_col into a list of column names requires + # the table metadata because we might use the primary keys + # when constructing the index. + index_cols = bf_read_gbq_table.get_index_cols( table=table, - table_expression=table_expression, index_col=index_col, - api_name=api_name, ) for key in index_cols: - if key not in table_expression.columns: + if key not in table_column_names: + possibility = min( + table_column_names, + key=lambda item: jellyfish.levenshtein_distance(key, item), + ) raise ValueError( - f"Column `{key}` of `index_col` not found in this table." + f"Column '{key}' of `index_col` not found in this table. Did you mean '{possibility}'?" ) - # TODO(b/337925142): We should push down column filters when we get the time - # travel table to avoid "SELECT *" subqueries. 
- if columns: - table_expression = table_expression.select([*index_cols, *columns]) + # ----------------------------- + # Optionally, execute the query + # ----------------------------- + + # max_results introduces non-determinism and limits the cost on + # clustered tables, so fallback to a query. We do this here so that + # the index is consistent with tables that have primary keys, even + # when max_results is set. + # TODO(b/338419730): We don't need to fallback to a query for wildcard + # tables if we allow some non-determinism when time travel isn't supported. + if max_results is not None or bf_io_bigquery.is_table_with_wildcard_suffix( + query + ): + # TODO(b/338111344): If we are running a query anyway, we might as + # well generate ROW_NUMBER() at the same time. + query = bf_io_bigquery.to_query( + query, + index_cols=index_cols, + columns=columns, + filters=filters, + max_results=max_results, + # We're executing the query, so we don't need time travel for + # determinism. + time_travel_timestamp=None, + ) + + return self._read_gbq_query( + query, + index_col=index_cols, + columns=columns, + api_name="read_gbq_table", + use_cache=use_cache, + ) + + # ----------------------------------------- + # Create Ibis table expression and validate + # ----------------------------------------- + + # Use a time travel to make sure the DataFrame is deterministic, even + # if the underlying table changes. + # TODO(b/340540991): If a dry run query fails with time travel but + # succeeds without it, omit the time travel clause and raise a warning + # about potential non-determinism if the underlying tables are modified. + table_expression = bf_read_gbq_table.get_ibis_time_travel_table( + ibis_client=self.ibis_client, + table_ref=table_ref, + index_cols=index_cols, + columns=columns, + filters=filters, + time_travel_timestamp=time_travel_timestamp, + ) # ---------------------------- # Create ordering and validate # ---------------------------- + # TODO(b/337925142): Generate a new subquery with just the index_cols + # in the Ibis table expression so we don't have a "SELECT *" subquery + # in the query that checks for index uniqueness. + # TODO(b/338065601): Provide a way to assume uniqueness and avoid this + # check. 
+ is_index_unique = bf_read_gbq_table.are_index_cols_unique( + bqclient=self.bqclient, + table=table, + index_cols=index_cols, + api_name=api_name, + ) + if is_index_unique: array_value = bf_read_gbq_table.to_array_value_with_total_ordering( session=self, @@ -755,15 +801,28 @@ def _read_gbq_table( ) # ---------------------------------------------------- - # Create Block & default index if len(index_cols) == 0 + # Create Default Sequential Index if still have no index # ---------------------------------------------------- + # If no index columns provided or found, fall back to sequential index + if (index_col != bigframes.enums.DefaultIndexKind.NULL) and len( + index_cols + ) == 0: + index_col = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 + + index_names: Sequence[Hashable] = index_cols + if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: + sequential_index_col = bigframes.core.guid.generate_guid("index_") + array_value = array_value.promote_offsets(sequential_index_col) + index_cols = [sequential_index_col] + index_names = [None] + value_columns = [col for col in array_value.column_ids if col not in index_cols] block = blocks.Block( array_value, index_columns=index_cols, column_labels=value_columns, - index_labels=index_cols, + index_labels=index_names, ) if max_results: block = block.slice(stop=max_results) diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index ed1bd39ada..28eed47965 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -31,6 +31,7 @@ import bigframes from bigframes.core import log_adapter +import bigframes.core.sql import bigframes.formatting_helpers as formatting_helpers IO_ORDERING_ID = "bqdf_row_nums" @@ -336,6 +337,8 @@ def to_query( index_cols: Iterable[str], columns: Iterable[str], filters: third_party_pandas_gbq.FiltersType, + max_results: Optional[int], + time_travel_timestamp: Optional[datetime.datetime], ) -> str: """Compile query_or_table with conditions(filters, wildcards) to query.""" filters = list(filters) @@ -353,7 +356,16 @@ def to_query( else: select_clause = "SELECT *" - where_clause = "" + time_travel_clause = "" + if time_travel_timestamp is not None: + time_travel_literal = bigframes.core.sql.simple_literal(time_travel_timestamp) + time_travel_clause = f" FOR SYSTEM_TIME AS OF {time_travel_literal}" + + limit_clause = "" + if max_results is not None: + limit_clause = f" LIMIT {bigframes.core.sql.simple_literal(max_results)}" + + filter_string = "" if filters: valid_operators: Mapping[third_party_pandas_gbq.FilterOps, str] = { "in": "IN", @@ -373,16 +385,15 @@ def to_query( ): filters = typing.cast(third_party_pandas_gbq.FiltersType, [filters]) - or_expressions = [] for group in filters: if not isinstance(group, Iterable): group = [group] - and_expressions = [] + and_expression = "" for filter_item in group: if not isinstance(filter_item, tuple) or (len(filter_item) != 3): raise ValueError( - f"Filter condition should be a tuple of length 3, {filter_item} is not valid." 
+ f"Elements of filters must be tuples of length 3, but got {repr(filter_item)}.", ) column, operator, value = filter_item @@ -397,17 +408,34 @@ def to_query( operator_str = valid_operators[operator] + column_ref = bigframes.core.sql.identifier(column) if operator_str in ["IN", "NOT IN"]: - value_list = ", ".join([repr(v) for v in value]) - expression = f"`{column}` {operator_str} ({value_list})" + value_literal = bigframes.core.sql.multi_literal(*value) else: - expression = f"`{column}` {operator_str} {repr(value)}" - and_expressions.append(expression) + value_literal = bigframes.core.sql.simple_literal(value) + expression = bigframes.core.sql.infix_op( + operator_str, column_ref, value_literal + ) + if and_expression: + and_expression = bigframes.core.sql.infix_op( + "AND", and_expression, expression + ) + else: + and_expression = expression - or_expressions.append(" AND ".join(and_expressions)) + if filter_string: + filter_string = bigframes.core.sql.infix_op( + "OR", filter_string, and_expression + ) + else: + filter_string = and_expression - if or_expressions: - where_clause = " WHERE " + " OR ".join(or_expressions) + where_clause = "" + if filter_string: + where_clause = f" WHERE {filter_string}" - full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" - return full_query + return ( + f"{select_clause} " + f"FROM {sub_query}" + f"{time_travel_clause}{where_clause}{limit_clause}" + ) diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 0f6a3dadd2..063dde2a24 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -20,12 +20,12 @@ import datetime import itertools -import textwrap import typing from typing import Dict, Iterable, List, Optional, Tuple import warnings import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops +import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions import google.cloud.bigquery as bigquery import ibis @@ -40,8 +40,9 @@ import bigframes.core.compile import bigframes.core.guid as guid import bigframes.core.ordering as order +import bigframes.core.sql import bigframes.dtypes -import bigframes.session._io.bigquery.read_gbq_table +import bigframes.session._io.bigquery import bigframes.session.clients import bigframes.version @@ -125,32 +126,32 @@ def get_table_metadata( return cached_table -def _create_time_travel_sql( - table_ref: bigquery.TableReference, time_travel_timestamp: datetime.datetime -) -> str: - """Query a table via 'time travel' for consistent reads.""" - # If we have an anonymous query results table, it can't be modified and - # there isn't any BigQuery time travel. - if table_ref.dataset_id.startswith("_"): - return f"SELECT * FROM `{table_ref.project}`.`{table_ref.dataset_id}`.`{table_ref.table_id}`" - - return textwrap.dedent( - f""" - SELECT * - FROM `{table_ref.project}`.`{table_ref.dataset_id}`.`{table_ref.table_id}` - FOR SYSTEM_TIME AS OF TIMESTAMP({repr(time_travel_timestamp.isoformat())}) - """ - ) - - def get_ibis_time_travel_table( ibis_client: ibis.BaseBackend, table_ref: bigquery.TableReference, - time_travel_timestamp: datetime.datetime, + index_cols: Iterable[str], + columns: Iterable[str], + filters: third_party_pandas_gbq.FiltersType, + time_travel_timestamp: Optional[datetime.datetime], ) -> ibis_types.Table: + # If we have an anonymous query results table, it can't be modified and + # there isn't any BigQuery time travel. 
+ if table_ref.dataset_id.startswith("_"): + time_travel_timestamp = None + try: return ibis_client.sql( - _create_time_travel_sql(table_ref, time_travel_timestamp) + bigframes.session._io.bigquery.to_query( + f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}", + index_cols=index_cols, + columns=columns, + filters=filters, + time_travel_timestamp=time_travel_timestamp, + # If we've made it this far, we know we don't have any + # max_results to worry about, because in that case we will + # have executed a query with a LIMI clause. + max_results=None, + ) ) except google.api_core.exceptions.Forbidden as ex: # Ibis does a dry run to get the types of the columns from the SQL. @@ -159,32 +160,29 @@ def get_ibis_time_travel_table( raise -def _check_index_uniqueness( +def are_index_cols_unique( bqclient: bigquery.Client, - ibis_client: ibis.BaseBackend, - table: ibis_types.Table, + table: bigquery.table.Table, index_cols: List[str], api_name: str, ) -> bool: - distinct_table = table.select(*index_cols).distinct() - is_unique_sql = f"""WITH full_table AS ( - {ibis_client.compile(table)} - ), - distinct_table AS ( - {ibis_client.compile(distinct_table)} - ) + if len(index_cols) == 0: + return False + # If index_cols contain the primary_keys, the query engine assumes they are + # provide a unique index. + primary_keys = frozenset(_get_primary_keys(table)) + if (len(primary_keys) > 0) and primary_keys <= frozenset(index_cols): + return True - SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`, - (SELECT COUNT(*) FROM distinct_table) AS `distinct_count` - """ + # TODO(b/337925142): Avoid a "SELECT *" subquery here by ensuring + # table_expression only selects just index_cols. + is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table.reference) job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name results = bqclient.query_and_wait(is_unique_sql, job_config=job_config) row = next(iter(results)) - total_count = row["total_count"] - distinct_count = row["distinct_count"] - return total_count == distinct_count + return row["total_count"] == row["distinct_count"] def _get_primary_keys( @@ -228,14 +226,10 @@ def _is_table_clustered_or_partitioned( return False -def get_index_cols_and_uniqueness( - bqclient: bigquery.Client, - ibis_client: ibis.BaseBackend, +def get_index_cols( table: bigquery.table.Table, - table_expression: ibis_types.Table, index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind, - api_name: str, -) -> Tuple[List[str], bool]: +) -> List[str]: """ If we can get a total ordering from the table, such as via primary key column(s), then return those too so that ordering generation can be @@ -248,11 +242,9 @@ def get_index_cols_and_uniqueness( if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: # User has explicity asked for a default, sequential index. # Use that, even if there are primary keys on the table. - # - # Note: This relies on the default behavior of the Block - # constructor to create a default sequential index. If that ever - # changes, this logic will need to be revisited. - return [], False + return [] + if index_col == bigframes.enums.DefaultIndexKind.NULL: + return [] else: # Note: It's actually quite difficult to mock this out to unit # test, as it's not possible to subclass enums in Python. See: @@ -289,19 +281,8 @@ def get_index_cols_and_uniqueness( # columns are unique, even if the constraint is not enforced. 
We make # the same assumption and use these columns as the total ordering keys. index_cols = primary_keys - is_index_unique = len(index_cols) != 0 - else: - is_index_unique = _check_index_uniqueness( - bqclient=bqclient, - ibis_client=ibis_client, - # TODO(b/337925142): Avoid a "SELECT *" subquery here by using - # _create_time_travel_sql with just index_cols. - table=table_expression, - index_cols=index_cols, - api_name=api_name, - ) - return index_cols, is_index_unique + return index_cols def get_time_travel_datetime_and_table_metadata( diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 32f13fa00d..e7680d1d35 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -117,10 +117,11 @@ def __init__( def _create_bigquery_client(self): bq_options = None if self._use_regional_endpoints: + # TODO(b/340896138): fix type error bq_options = google.api_core.client_options.ClientOptions( api_endpoint=( _BIGQUERY_REGIONAL_ENDPOINT - if self._location.lower() in _REP_SUPPORTED_REGIONS + if self._location.lower() in _REP_SUPPORTED_REGIONS # type: ignore else _BIGQUERY_LOCATIONAL_ENDPOINT ).format(location=self._location), ) @@ -158,12 +159,11 @@ def bqconnectionclient(self): bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name ) - self._bqconnectionclient = ( - google.cloud.bigquery_connection_v1.ConnectionServiceClient( - client_info=bqconnection_info, - client_options=bqconnection_options, - credentials=self._credentials, - ) + # TODO(b/340896138): fix type error + self._bqconnectionclient = google.cloud.bigquery_connection_v1.ConnectionServiceClient( # type: ignore + client_info=bqconnection_info, + client_options=bqconnection_options, + credentials=self._credentials, ) return self._bqconnectionclient @@ -173,18 +173,20 @@ def bqstoragereadclient(self): if not self._bqstoragereadclient: bqstorage_options = None if self._use_regional_endpoints: + # TODO(b/340896138): fix type error bqstorage_options = google.api_core.client_options.ClientOptions( api_endpoint=( _BIGQUERYSTORAGE_REGIONAL_ENDPOINT - if self._location.lower() in _REP_SUPPORTED_REGIONS + if self._location.lower() in _REP_SUPPORTED_REGIONS # type: ignore else _BIGQUERYSTORAGE_LOCATIONAL_ENDPOINT ).format(location=self._location), ) bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name ) + # TODO(b/340896138): fix type error self._bqstoragereadclient = ( - google.cloud.bigquery_storage_v1.BigQueryReadClient( + google.cloud.bigquery_storage_v1.BigQueryReadClient( # type: ignore client_info=bqstorage_info, client_options=bqstorage_options, credentials=self._credentials, @@ -199,8 +201,9 @@ def cloudfunctionsclient(self): functions_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name ) + # TODO(b/340896138): fix type error self._cloudfunctionsclient = ( - google.cloud.functions_v2.FunctionServiceClient( + google.cloud.functions_v2.FunctionServiceClient( # type: ignore client_info=functions_info, credentials=self._credentials, ) @@ -214,8 +217,9 @@ def resourcemanagerclient(self): resourcemanager_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name ) + # TODO(b/340896138): fix type error self._resourcemanagerclient = ( - google.cloud.resourcemanager_v3.ProjectsClient( + google.cloud.resourcemanager_v3.ProjectsClient( # type: ignore credentials=self._credentials, client_info=resourcemanager_info ) ) diff --git 
a/bigframes/version.py b/bigframes/version.py
index e139eaa89e..da33be63fc 100644
--- a/bigframes/version.py
+++ b/bigframes/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.6.0"
+__version__ = "1.7.0"
diff --git a/noxfile.py b/noxfile.py
index af73495a7f..c816ec5f51 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -248,6 +248,7 @@ def mypy(session):
         "bigframes",
         os.path.join("tests", "system"),
         os.path.join("tests", "unit"),
+        "--check-untyped-defs",
         "--explicit-package-bases",
         '--exclude="^third_party"',
     )
diff --git a/samples/snippets/imported_onnx_model_test.py b/samples/snippets/imported_onnx_model_test.py
new file mode 100644
index 0000000000..87157ee60d
--- /dev/null
+++ b/samples/snippets/imported_onnx_model_test.py
@@ -0,0 +1,43 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def test_imported_sklearn_onnx_model() -> None:
+    # Determine project id, in this case prefer the one set in the environment
+    # variable GOOGLE_CLOUD_PROJECT (if any)
+    import os
+
+    PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev")
+
+    # [START bigquery_dataframes_imported_sklearn_onnx_tutorial_import_onnx_models]
+    import bigframes
+    from bigframes.ml.imported import ONNXModel
+
+    bigframes.options.bigquery.project = PROJECT_ID
+    # You can change the location to one of the valid locations: https://cloud.google.com/bigquery/docs/locations#supported_locations
+    bigframes.options.bigquery.location = "US"
+
+    imported_onnx_model = ONNXModel(
+        model_path="gs://cloud-samples-data/bigquery/ml/onnx/pipeline_rf.onnx"
+    )
+    # [END bigquery_dataframes_imported_sklearn_onnx_tutorial_import_onnx_models]
+    assert imported_onnx_model is not None
+
+    # [START bigquery_dataframes_imported_sklearn_onnx_tutorial_make_predictions]
+    import bigframes.pandas as bpd
+
+    df = bpd.read_gbq("bigquery-public-data.ml_datasets.iris")
+    predictions = imported_onnx_model.predict(df)
+    predictions.peek(5)
+    # [END bigquery_dataframes_imported_sklearn_onnx_tutorial_make_predictions]
diff --git a/samples/snippets/imported_tensorflow_model_test.py b/samples/snippets/imported_tensorflow_model_test.py
new file mode 100644
index 0000000000..7dcf0ffe6d
--- /dev/null
+++ b/samples/snippets/imported_tensorflow_model_test.py
@@ -0,0 +1,44 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ + +def test_imported_tensorflow_model() -> None: + # Determine project id, in this case prefer the one set in the environment + # variable GOOGLE_CLOUD_PROJECT (if any) + import os + + PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") + + # [START bigquery_dataframes_imported_tensorflow_tutorial_import_tensorflow_models] + import bigframes + from bigframes.ml.imported import TensorFlowModel + + bigframes.options.bigquery.project = PROJECT_ID + # You can change the location to one of the valid locations: https://cloud.google.com/bigquery/docs/locations#supported_locations + bigframes.options.bigquery.location = "US" + + imported_tensorflow_model = TensorFlowModel( + model_path="gs://cloud-training-demos/txtclass/export/exporter/1549825580/*" + ) + # [END bigquery_dataframes_imported_tensorflow_tutorial_import_tensorflow_models] + assert imported_tensorflow_model is not None + + # [START bigquery_dataframes_imported_tensorflow_tutorial_make_predictions] + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.hacker_news.full") + df_pred = df.rename(columns={"title": "input"}) + predictions = imported_tensorflow_model.predict(df_pred) + predictions.head(5) + # [END bigquery_dataframes_imported_tensorflow_tutorial_make_predictions] diff --git a/samples/snippets/logistic_regression_prediction_test.py b/samples/snippets/logistic_regression_prediction_test.py index 6a40369ba8..dd92f8f3e3 100644 --- a/samples/snippets/logistic_regression_prediction_test.py +++ b/samples/snippets/logistic_regression_prediction_test.py @@ -80,7 +80,21 @@ def test_logistic_regression_prediction(random_model_id: str) -> None: X = training_data.drop(columns=["income_bracket", "dataframe"]) y = training_data["income_bracket"] - census_model = bigframes.ml.linear_model.LogisticRegression() + census_model = bigframes.ml.linear_model.LogisticRegression( + # Balance the class labels in the training data by setting + # class_weight="balanced". + # + # By default, the training data is unweighted. If the labels + # in the training data are imbalanced, the model may learn to + # predict the most popular class of labels more heavily. In + # this case, most of the respondents in the dataset are in the + # lower income bracket. This may lead to a model that predicts + # the lower income bracket too heavily. Class weights balance + # the class labels by calculating the weights for each class in + # inverse proportion to the frequency of that class. 
+ class_weight="balanced", + max_iterations=15, + ) census_model.fit(X, y) census_model.to_gbq( diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index 1da77c1715..8fcd19bb2c 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,2 +1,2 @@ # samples/snippets should be runnable with no "extras" -bigframes==1.4.0 +bigframes==1.6.0 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 4ebb3cb93b..ecf633b27f 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -391,6 +391,16 @@ def scalars_df_index( return session.read_gbq(scalars_table_id, index_col="rowindex") +@pytest.fixture(scope="session") +def scalars_df_empty_index( + scalars_table_id: str, session: bigframes.Session +) -> bigframes.dataframe.DataFrame: + """DataFrame pointing at test data.""" + return session.read_gbq( + scalars_table_id, index_col=bigframes.enums.DefaultIndexKind.NULL + ).sort_values("rowindex") + + @pytest.fixture(scope="session") def scalars_df_2_default_index( scalars_df_2_index: bigframes.dataframe.DataFrame, @@ -1040,7 +1050,7 @@ def floats_pd(): dtype=pd.Float64Dtype(), ) # Index helps debug failed cases - df.index = df.float64_col + df.index = df.float64_col # type: ignore # Upload fails if index name same as column name df.index.name = None return df.float64_col @@ -1050,7 +1060,7 @@ def floats_pd(): def floats_product_pd(floats_pd): df = pd.merge(floats_pd, floats_pd, how="cross") # Index helps debug failed cases - df = df.set_index([df.float64_col_x, df.float64_col_y]) + df = df.set_index([df.float64_col_x, df.float64_col_y]) # type: ignore df.index.names = ["left", "right"] return df diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index b633ca4ea2..152fd168be 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -13,13 +13,11 @@ # limitations under the License. 
import pandas as pd -import pytest from bigframes.ml import cluster -from tests.system.utils import assert_pandas_df_equal +from tests.system import utils -@pytest.mark.flaky(retries=2) def test_cluster_configure_fit_score_predict( session, penguins_df_default_index, dataset_id ): @@ -88,31 +86,24 @@ def test_cluster_configure_fit_score_predict( # Check score to ensure the model was fitted score_result = model.score(new_penguins).to_pandas() - score_expected = pd.DataFrame( - {"davies_bouldin_index": [1.502182], "mean_squared_distance": [1.953408]}, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 - ) + eval_metrics = ["davies_bouldin_index", "mean_squared_distance"] + utils.check_pandas_df_schema_and_index(score_result, columns=eval_metrics, index=1) predictions = model.predict(new_penguins).to_pandas() assert predictions.shape == (4, 9) - result = predictions[["CENTROID_ID"]] - expected = pd.DataFrame( - {"CENTROID_ID": [2, 3, 1, 2]}, - dtype="Int64", - index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), + utils.check_pandas_df_schema_and_index( + predictions, + columns=["CENTROID_ID"], + index=["test1", "test2", "test3", "test4"], + col_exact=False, ) - expected.index.name = "observation" - assert_pandas_df_equal(result, expected, ignore_order=True) # save, load, check n_clusters to ensure configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_cluster_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_cluster_model" in reloaded_model._bqml_model.model_name @@ -153,6 +144,7 @@ def test_cluster_configure_fit_load_params(penguins_df_default_index, dataset_id reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_cluster_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_cluster_model" in reloaded_model._bqml_model.model_name diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 7513b78b29..45322e78dd 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pandas - from bigframes.ml import compose, preprocessing +from tests.system import utils def test_columntransformer_standalone_fit_and_transform( @@ -45,26 +44,18 @@ def test_columntransformer_standalone_fit_and_transform( ) result = transformer.transform(new_penguins_df).to_pandas() - expected = pandas.DataFrame( - { - "onehotencoded_species": [ - [{"index": 1, "value": 1.0}], - [{"index": 1, "value": 1.0}], - [{"index": 2, "value": 1.0}], - ], - "standard_scaled_culmen_length_mm": [ - -0.811119671289163, - -0.9945520581113803, - -1.104611490204711, - ], - "min_max_scaled_culmen_length_mm": [0.269, 0.232, 0.210], - "standard_scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198], - }, - index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), + utils.check_pandas_df_schema_and_index( + result, + columns=[ + "onehotencoded_species", + "standard_scaled_culmen_length_mm", + "min_max_scaled_culmen_length_mm", + "standard_scaled_flipper_length_mm", + ], + index=[1633, 1672, 1690], + col_exact=False, ) - pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) - def test_columntransformer_standalone_fit_transform(new_penguins_df): transformer = compose.ColumnTransformer( @@ -86,25 +77,17 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] ).to_pandas() - expected = pandas.DataFrame( - { - "onehotencoded_species": [ - [{"index": 1, "value": 1.0}], - [{"index": 1, "value": 1.0}], - [{"index": 2, "value": 1.0}], - ], - "standard_scaled_culmen_length_mm": [ - 1.313249, - -0.20198, - -1.111118, - ], - "standard_scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338], - }, - index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), + utils.check_pandas_df_schema_and_index( + result, + columns=[ + "onehotencoded_species", + "standard_scaled_culmen_length_mm", + "standard_scaled_flipper_length_mm", + ], + index=[1633, 1672, 1690], + col_exact=False, ) - pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) - def test_columntransformer_save_load(new_penguins_df, dataset_id): transformer = compose.ColumnTransformer( @@ -147,21 +130,13 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] ).to_pandas() - expected = pandas.DataFrame( - { - "onehotencoded_species": [ - [{"index": 1, "value": 1.0}], - [{"index": 1, "value": 1.0}], - [{"index": 2, "value": 1.0}], - ], - "standard_scaled_culmen_length_mm": [ - 1.313249, - -0.20198, - -1.111118, - ], - "standard_scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338], - }, - index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), + utils.check_pandas_df_schema_and_index( + result, + columns=[ + "onehotencoded_species", + "standard_scaled_culmen_length_mm", + "standard_scaled_flipper_length_mm", + ], + index=[1633, 1672, 1690], + col_exact=False, ) - - pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index aec1065e41..be5eea925f 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -12,14 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pandas -import pytest - from bigframes.ml import globals +from tests.system import utils -# TODO(garrettwu): Re-enable or not check exact numbers. -@pytest.mark.skip(reason="bqml regression") def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_df): df = penguins_df_default_index.dropna() X_train = df[ @@ -38,41 +34,33 @@ def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_d X_train, y_train, options={"model_type": "linear_reg"} ) + eval_metrics = [ + "mean_absolute_error", + "mean_squared_error", + "mean_squared_log_error", + "median_absolute_error", + "r2_score", + "explained_variance", + ] # no data - report evaluation from the automatic data split evaluate_result = model.evaluate().to_pandas() - evaluate_expected = pandas.DataFrame( - { - "mean_absolute_error": [225.817334], - "mean_squared_error": [80540.705944], - "mean_squared_log_error": [0.004972], - "median_absolute_error": [173.080816], - "r2_score": [0.87529], - "explained_variance": [0.87529], - }, - dtype="Float64", - ) - evaluate_expected = evaluate_expected.reindex( - index=evaluate_expected.index.astype("Int64") - ) - pandas.testing.assert_frame_equal( - evaluate_result, evaluate_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + evaluate_result, columns=eval_metrics, index=1 ) # evaluate on all training data evaluate_result = model.evaluate(df).to_pandas() - pandas.testing.assert_frame_equal( - evaluate_result, evaluate_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + evaluate_result, columns=eval_metrics, index=1 ) # predict new labels predictions = model.predict(new_penguins_df).to_pandas() - expected = pandas.DataFrame( - {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]}, - dtype="Float64", - index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pandas.testing.assert_frame_equal( - predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_body_mass_g"], + index=[1633, 1672, 1690], + col_exact=False, ) new_name = f"{dataset_id}.my_model" @@ -108,42 +96,34 @@ def test_bqml_manual_preprocessing_e2e( X_train, y_train, transforms=transforms, options=options ) + eval_metrics = [ + "mean_absolute_error", + "mean_squared_error", + "mean_squared_log_error", + "median_absolute_error", + "r2_score", + "explained_variance", + ] + # no data - report evaluation from the automatic data split evaluate_result = model.evaluate().to_pandas() - evaluate_expected = pandas.DataFrame( - { - "mean_absolute_error": [309.477334], - "mean_squared_error": [152184.227218], - "mean_squared_log_error": [0.009524], - "median_absolute_error": [257.727777], - "r2_score": [0.764356], - "explained_variance": [0.764356], - }, - dtype="Float64", - ) - evaluate_expected = evaluate_expected.reindex( - index=evaluate_expected.index.astype("Int64") - ) - - pandas.testing.assert_frame_equal( - evaluate_result, evaluate_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + evaluate_result, columns=eval_metrics, index=1 ) # evaluate on all training data evaluate_result = model.evaluate(df).to_pandas() - pandas.testing.assert_frame_equal( - evaluate_result, evaluate_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + evaluate_result, columns=eval_metrics, index=1 ) # predict new labels predictions = model.predict(new_penguins_df).to_pandas() - expected = 
pandas.DataFrame( - {"predicted_body_mass_g": [3968.8, 3176.3, 3545.2]}, - dtype="Float64", - index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pandas.testing.assert_frame_equal( - predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_body_mass_g"], + index=[1633, 1672, 1690], + col_exact=False, ) new_name = f"{dataset_id}.my_model" @@ -168,24 +148,9 @@ def test_bqml_standalone_transform(penguins_df_default_index, new_penguins_df): ) transformed = model.transform(new_penguins_df).to_pandas() - expected = pandas.DataFrame( - { - "scaled_culmen_length_mm": [-0.8099, -0.9931, -1.103], - "onehotencoded_species": [ - [{"index": 1, "value": 1.0}], - [{"index": 1, "value": 1.0}], - [{"index": 2, "value": 1.0}], - ], - }, - index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - expected["scaled_culmen_length_mm"] = expected["scaled_culmen_length_mm"].astype( - "Float64" - ) - pandas.testing.assert_frame_equal( - transformed[["scaled_culmen_length_mm", "onehotencoded_species"]], - expected, - check_exact=False, - rtol=0.1, - check_dtype=False, + utils.check_pandas_df_schema_and_index( + transformed, + columns=["scaled_culmen_length_mm", "onehotencoded_species"], + index=[1633, 1672, 1690], + col_exact=False, ) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 264b95a92e..49aa985189 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -15,7 +15,7 @@ import pandas as pd from bigframes.ml import decomposition -import tests.system.utils +from tests.system import utils def test_decomposition_configure_fit_score_predict( @@ -45,40 +45,26 @@ def test_decomposition_configure_fit_score_predict( # Check score to ensure the model was fitted score_result = model.score(new_penguins).to_pandas() - score_expected = pd.DataFrame( - { - "total_explained_variance_ratio": [0.812383], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + score_result, columns=["total_explained_variance_ratio"], index=1 ) result = model.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - { - "principal_component_1": [-1.459, 2.258, -1.685], - "principal_component_2": [-1.120, -1.351, -0.874], - "principal_component_3": [-0.646, 0.443, -0.704], - }, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - - tests.system.utils.assert_pandas_df_equal_pca( + utils.check_pandas_df_schema_and_index( result, - expected, - check_exact=False, - rtol=0.1, + columns=[ + "principal_component_1", + "principal_component_2", + "principal_component_3", + ], + index=[1633, 1672, 1690], ) # save, load, check n_components to ensure configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_pca_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_pca_model" in reloaded_model._bqml_model.model_name @@ -113,42 +99,28 @@ def test_decomposition_configure_fit_score_predict_params( # Check score to ensure the model was fitted score_result = model.score(new_penguins).to_pandas() - score_expected = pd.DataFrame( - { - "total_explained_variance_ratio": [0.932897], - 
}, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + score_result, columns=["total_explained_variance_ratio"], index=1 ) result = model.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - { - "principal_component_1": [-1.459, 2.258, -1.685], - "principal_component_2": [-1.120, -1.351, -0.874], - "principal_component_3": [-0.646, 0.443, -0.704], - "principal_component_4": [-0.539, 0.234, -0.571], - "principal_component_5": [-0.876, 0.122, 0.609], - }, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - - tests.system.utils.assert_pandas_df_equal_pca( + utils.check_pandas_df_schema_and_index( result, - expected, - check_exact=False, - rtol=0.1, + columns=[ + "principal_component_1", + "principal_component_2", + "principal_component_3", + "principal_component_4", + "principal_component_5", + ], + index=[1633, 1672, 1690], ) # save, load, check n_components to ensure configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_pca_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_pca_model" in reloaded_model._bqml_model.model_name @@ -167,6 +139,7 @@ def test_decomposition_configure_fit_load_float_component( reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_pca_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_pca_model" in reloaded_model._bqml_model.model_name @@ -184,6 +157,7 @@ def test_decomposition_configure_fit_load_none_component( reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_pca_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_pca_model" in reloaded_model._bqml_model.model_name diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index 3d1fcaf41c..e00f7fa665 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -12,16 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from unittest import TestCase - -import pandas import pytest import bigframes.ml.ensemble +from tests.system import utils -# TODO(garrettwu): Re-enable or not check exact numbers. 
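The hunks above and below replace hard-coded expected frames with tests.system.utils.check_pandas_df_schema_and_index. As a rough sketch of what that helper is assumed to verify, inferred only from the call sites in this diff (the real implementation may differ):

from typing import Iterable, Union

import pandas as pd


def check_pandas_df_schema_and_index(
    pd_df: pd.DataFrame,
    columns: Iterable[str],
    index: Union[int, Iterable],
    col_exact: bool = True,
) -> None:
    """Approximation: check column names plus either row count or index values."""
    if col_exact:
        # exact column set and order expected
        assert list(pd_df.columns) == list(columns)
    else:
        # listed columns must be present, extras allowed
        assert set(columns) <= set(pd_df.columns)
    if isinstance(index, int):
        assert len(pd_df) == index  # expected number of rows
    else:
        assert list(pd_df.index) == list(index)  # expected index values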
-@pytest.mark.skip(reason="bqml regression") @pytest.mark.flaky(retries=2) def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.XGBRegressor() @@ -42,24 +38,15 @@ def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - expected = pandas.DataFrame( - { - "mean_absolute_error": [97.368139], - "mean_squared_error": [16284.877027], - "mean_squared_log_error": [0.0010189], - "median_absolute_error": [72.158691], - "r2_score": [0.974784], - "explained_variance": [0.974845], - }, - dtype="Float64", + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 ) - expected = expected.reindex(index=expected.index.astype("Int64")) - pandas.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_xgbregressor_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_xgbregressor_model" in reloaded_model._bqml_model.model_name @@ -103,21 +90,15 @@ def test_xgbregressor_dart_booster_multiple_params( # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ - "mean_absolute_error", - "mean_squared_error", - "mean_squared_log_error", - "median_absolute_error", - "r2_score", - "explained_variance", - ]: - assert col_name in result.columns + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_xgbregressor_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_xgbregressor_model" in reloaded_model._bqml_model.model_name @@ -159,28 +140,22 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ - "precision", - "recall", - "accuracy", - "f1_score", - "log_loss", - "roc_auc", - ]: - assert col_name in result.columns + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_xgbclassifier_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_xgbclassifier_model" in reloaded_model._bqml_model.model_name ) -@pytest.mark.flaky(retries=2) +# @pytest.mark.flaky(retries=2) def test_xgbclassifier_dart_booster_multiple_params( penguins_df_default_index, dataset_id ): @@ -216,21 +191,15 @@ def test_xgbclassifier_dart_booster_multiple_params( # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ - "precision", - "recall", - "accuracy", - "f1_score", - "log_loss", - "roc_auc", - ]: - assert col_name in result.columns + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 + ) # save, load, check parameters to ensure 
configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_xgbclassifier_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_xgbclassifier_model" in reloaded_model._bqml_model.model_name @@ -273,21 +242,15 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ - "mean_absolute_error", - "mean_squared_error", - "mean_squared_log_error", - "median_absolute_error", - "r2_score", - "explained_variance", - ]: - assert col_name in result.columns + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_randomforestregressor_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_randomforestregressor_model" in reloaded_model._bqml_model.model_name @@ -326,21 +289,15 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ - "mean_absolute_error", - "mean_squared_error", - "mean_squared_log_error", - "median_absolute_error", - "r2_score", - "explained_variance", - ]: - assert col_name in result.columns + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_randomforestregressor_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_randomforestregressor_model" in reloaded_model._bqml_model.model_name @@ -379,21 +336,15 @@ def test_randomforestclassifier_default_params(penguins_df_default_index, datase # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ - "precision", - "recall", - "accuracy", - "f1_score", - "log_loss", - "roc_auc", - ]: - assert col_name in result.columns + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_randomforestclassifier_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_randomforestclassifier_model" in reloaded_model._bqml_model.model_name @@ -403,7 +354,7 @@ def test_randomforestclassifier_default_params(penguins_df_default_index, datase @pytest.mark.flaky(retries=2) def test_randomforestclassifier_multiple_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.RandomForestClassifier( - tree_method="AUTO", + tree_method="auto", min_tree_child_weight=2, colsample_bytree=0.95, colsample_bylevel=0.95, @@ -431,26 +382,20 @@ def test_randomforestclassifier_multiple_params(penguins_df_default_index, datas # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - TestCase().assertSequenceEqual(result.shape, (1, 6)) - for col_name in [ - 
"precision", - "recall", - "accuracy", - "f1_score", - "log_loss", - "roc_auc", - ]: - assert col_name in result.columns + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 + ) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_randomforestclassifier_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_randomforestclassifier_model" in reloaded_model._bqml_model.model_name ) - assert reloaded_model.tree_method == "AUTO" + assert reloaded_model.tree_method == "auto" assert reloaded_model.colsample_bytree == 0.95 assert reloaded_model.colsample_bylevel == 0.95 assert reloaded_model.colsample_bynode == 0.95 diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index ef74398c2e..79deb615b1 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -12,15 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pandas as pd import pytest from bigframes.ml import forecasting +from tests.system import utils ARIMA_EVALUATE_OUTPUT_COL = [ "non_seasonal_p", "non_seasonal_d", "non_seasonal_q", + "has_drift", "log_likelihood", "AIC", "variance", @@ -50,18 +51,17 @@ def test_arima_plus_model_fit_score( result = arima_model.score( new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]] ).to_pandas() - expected = pd.DataFrame( - { - "mean_absolute_error": [154.742547], - "mean_squared_error": [26844.868855], - "root_mean_squared_error": [163.844038], - "mean_absolute_percentage_error": [6.189702], - "symmetric_mean_absolute_percentage_error": [6.097155], - }, - dtype="Float64", + utils.check_pandas_df_schema_and_index( + result, + columns=[ + "mean_absolute_error", + "mean_squared_error", + "root_mean_squared_error", + "mean_absolute_percentage_error", + "symmetric_mean_absolute_percentage_error", + ], + index=1, ) - expected = expected.reindex(index=expected.index.astype("Int64")) - pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load to ensure configuration was kept reloaded_model = arima_model.to_gbq( @@ -73,10 +73,10 @@ def test_arima_plus_model_fit_score( def test_arima_plus_model_fit_summary(dataset_id, arima_model): - - result = arima_model.summary() - assert result.shape == (1, 12) - assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) + result = arima_model.summary().to_pandas() + utils.check_pandas_df_schema_and_index( + result, columns=ARIMA_EVALUATE_OUTPUT_COL, index=1 + ) # save, load to ensure configuration was kept reloaded_model = arima_model.to_gbq( @@ -88,13 +88,13 @@ def test_arima_plus_model_fit_summary(dataset_id, arima_model): def test_arima_coefficients(arima_model): - got = arima_model.coef_ - expected_columns = { + result = arima_model.coef_.to_pandas() + expected_columns = [ "ar_coefficients", "ma_coefficients", "intercept_or_drift", - } - assert set(got.columns) == expected_columns + ] + utils.check_pandas_df_schema_and_index(result, columns=expected_columns, index=1) def test_arima_plus_model_fit_params(time_series_df_default_index, dataset_id): @@ -119,6 +119,7 @@ def test_arima_plus_model_fit_params(time_series_df_default_index, dataset_id): # save, load to ensure configuration was kept reloaded_model = model.to_gbq(f"{dataset_id}.temp_arima_plus_model", replace=True) 
+ assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_arima_plus_model" in reloaded_model._bqml_model.model_name ) diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 99121e4a31..eaf666fd50 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pandas as pd - import bigframes.ml.linear_model +from tests.system import utils def test_linear_regression_configure_fit_score(penguins_df_default_index, dataset_id): @@ -36,22 +35,13 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - expected = pd.DataFrame( - { - "mean_absolute_error": [225.735767], - "mean_squared_error": [80417.461828], - "mean_squared_log_error": [0.004967], - "median_absolute_error": [172.543702], - "r2_score": [0.87548], - "explained_variance": [0.87548], - }, - dtype="Float64", + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 ) - expected = expected.reindex(index=expected.index.astype("Int64")) - pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq(f"{dataset_id}.temp_configured_model", replace=True) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name ) @@ -98,22 +88,13 @@ def test_linear_regression_customized_params_fit_score( # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - expected = pd.DataFrame( - { - "mean_absolute_error": [240], - "mean_squared_error": [91197], - "mean_squared_log_error": [0.00573], - "median_absolute_error": [197], - "r2_score": [0.858], - "explained_variance": [0.8588], - }, - dtype="Float64", + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 ) - expected = expected.reindex(index=expected.index.astype("Int64")) - pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq(f"{dataset_id}.temp_configured_model", replace=True) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name ) @@ -152,24 +133,15 @@ def test_logistic_regression_configure_fit_score(penguins_df_default_index, data # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - expected = pd.DataFrame( - { - "precision": [0.616753], - "recall": [0.618615], - "accuracy": [0.92515], - "f1_score": [0.617681], - "log_loss": [1.498832], - "roc_auc": [0.975807], - }, - dtype="Float64", + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 ) - expected = expected.reindex(index=expected.index.astype("Int64")) - pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_logistic_reg_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_logistic_reg_model" in 
reloaded_model._bqml_model.model_name @@ -207,24 +179,15 @@ def test_logistic_regression_customized_params_fit_score( # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() - expected = pd.DataFrame( - { - "precision": [0.487], - "recall": [0.602], - "accuracy": [0.464], - "f1_score": [0.379], - "log_loss": [0.972], - "roc_auc": [0.700], - }, - dtype="Float64", + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 ) - expected = expected.reindex(index=expected.index.astype("Int64")) - pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load, check parameters to ensure configuration was kept reloaded_model = model.to_gbq( f"{dataset_id}.temp_configured_logistic_reg_model", replace=True ) + assert reloaded_model._bqml_model is not None assert ( f"{dataset_id}.temp_configured_logistic_reg_model" in reloaded_model._bqml_model.model_name diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 1a92d0f7d4..3d7eb2e426 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -24,7 +24,7 @@ pipeline, preprocessing, ) -from tests.system.utils import assert_pandas_df_equal, assert_pandas_df_equal_pca +from tests.system import utils def test_pipeline_linear_regression_fit_score_predict( @@ -51,21 +51,8 @@ def test_pipeline_linear_regression_fit_score_predict( # Check score to ensure the model was fitted score_result = pl.score(X_train, y_train).to_pandas() - score_expected = pd.DataFrame( - { - "mean_absolute_error": [309.477331], - "mean_squared_error": [152184.227219], - "mean_squared_log_error": [0.009524], - "median_absolute_error": [257.728263], - "r2_score": [0.764356], - "explained_variance": [0.764356], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + score_result, columns=utils.ML_REGRESSION_METRICS, index=1 ) # predict new labels @@ -87,13 +74,11 @@ def test_pipeline_linear_regression_fit_score_predict( ).set_index("tag_number") ) predictions = pl.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - {"predicted_body_mass_g": [3968.8, 3176.3, 3545.2]}, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pd.testing.assert_frame_equal( - predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_body_mass_g"], + index=[1633, 1672, 1690], + col_exact=False, ) @@ -115,21 +100,8 @@ def test_pipeline_linear_regression_series_fit_score_predict( # Check score to ensure the model was fitted score_result = pl.score(X_train, y_train).to_pandas() - score_expected = pd.DataFrame( - { - "mean_absolute_error": [528.495599], - "mean_squared_error": [421722.261808], - "mean_squared_log_error": [0.022963], - "median_absolute_error": [468.895249], - "r2_score": [0.346999], - "explained_variance": [0.346999], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + score_result, columns=utils.ML_REGRESSION_METRICS, index=1 ) # predict new labels @@ -142,13 +114,11 @@ 
def test_pipeline_linear_regression_series_fit_score_predict( ).set_index("tag_number") ) predictions = pl.predict(new_penguins["culmen_length_mm"]).to_pandas() - expected = pd.DataFrame( - {"predicted_body_mass_g": [3818.845703, 3732.022253, 3679.928123]}, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pd.testing.assert_frame_equal( - predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_body_mass_g"], + index=[1633, 1672, 1690], + col_exact=False, ) @@ -176,21 +146,8 @@ def test_pipeline_logistic_regression_fit_score_predict( # Check score to ensure the model was fitted score_result = pl.score(X_train, y_train).to_pandas() - score_expected = pd.DataFrame( - { - "precision": [0.537091], - "recall": [0.538636], - "accuracy": [0.805389], - "f1_score": [0.537716], - "log_loss": [1.445433], - "roc_auc": [0.917818], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + score_result, columns=utils.ML_CLASSFICATION_METRICS, index=1 ) # predict new labels @@ -211,19 +168,14 @@ def test_pipeline_logistic_regression_fit_score_predict( ).set_index("tag_number") ) predictions = pl.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - {"predicted_sex": ["MALE", "FEMALE", "FEMALE"]}, - dtype=pd.StringDtype(storage="pyarrow"), - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pd.testing.assert_frame_equal( - predictions[["predicted_sex"]], - expected, + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_sex"], + index=[1633, 1672, 1690], + col_exact=False, ) -# TODO(garrettwu): Re-enable or not check exact numbers. 
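Context for the skip/flaky markers touched in this file and in test_remote_function.py further down: the flaky marker appears to come from the pytest-retry plugin (an assumption based on the retries=/delay= keywords), which reruns a failed test a few times before reporting failure. A trivial usage sketch:

import pytest


@pytest.mark.flaky(retries=2, delay=120)  # rerun up to twice, waiting 120s between attempts
def test_depends_on_eventually_consistent_backend():
    assert True  # placeholder body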
-@pytest.mark.skip(reason="bqml regression") @pytest.mark.flaky(retries=2) def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_index): """Test a supervised model with a minimal preprocessing step""" @@ -247,21 +199,8 @@ def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_in # Check score to ensure the model was fitted score_result = pl.score(X_train, y_train).to_pandas() - score_expected = pd.DataFrame( - { - "mean_absolute_error": [202.298434], - "mean_squared_error": [74515.108971], - "mean_squared_log_error": [0.004365], - "median_absolute_error": [142.949219], - "r2_score": [0.88462], - "explained_variance": [0.886454], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + score_result, columns=utils.ML_REGRESSION_METRICS, index=1 ) # predict new labels @@ -283,24 +222,14 @@ def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_in ).set_index("tag_number") ) predictions = pl.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - { - "predicted_body_mass_g": [ - 4287.34521484375, - 3198.351806640625, - 3385.34130859375, - ] - }, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pd.testing.assert_frame_equal( - predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_body_mass_g"], + index=[1633, 1672, 1690], + col_exact=False, ) -# TODO(garrettwu): Re-enable or not check exact numbers. -@pytest.mark.skip(reason="bqml regression") @pytest.mark.flaky(retries=2) def test_pipeline_random_forest_classifier_fit_score_predict( session, penguins_df_default_index @@ -326,21 +255,8 @@ def test_pipeline_random_forest_classifier_fit_score_predict( # Check score to ensure the model was fitted score_result = pl.score(X_train, y_train).to_pandas() - score_expected = pd.DataFrame( - { - "precision": [0.585505], - "recall": [0.58676], - "accuracy": [0.877246], - "f1_score": [0.585657], - "log_loss": [0.880643], - "roc_auc": [0.970697], - }, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + score_result, columns=utils.ML_CLASSFICATION_METRICS, index=1 ) # predict new labels @@ -361,14 +277,11 @@ def test_pipeline_random_forest_classifier_fit_score_predict( ).set_index("tag_number") ) predictions = pl.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - {"predicted_sex": ["MALE", "FEMALE", "FEMALE"]}, - dtype=pd.StringDtype(storage="pyarrow"), - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pd.testing.assert_frame_equal( - predictions[["predicted_sex"]], - expected, + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_sex"], + index=[1633, 1672, 1690], + col_exact=False, ) @@ -412,40 +325,20 @@ def test_pipeline_PCA_fit_score_predict(session, penguins_df_default_index): # Check score to ensure the model was fitted score_result = pl.score(new_penguins).to_pandas() - score_expected = pd.DataFrame( - { - "total_explained_variance_ratio": [1.0], - }, - dtype="Float64", - ) - score_expected = 
score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + score_result, columns=["total_explained_variance_ratio"], index=1 ) predictions = pl.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - { - "principal_component_1": [-1.115259, -1.506141, -1.471173], - "principal_component_2": [-0.074825, 0.69664, 0.406103], - "principal_component_3": [0.500013, -0.544479, 0.075849], - }, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - - assert_pandas_df_equal_pca( - predictions[ - [ - "principal_component_1", - "principal_component_2", - "principal_component_3", - ] + utils.check_pandas_df_schema_and_index( + predictions, + columns=[ + "principal_component_1", + "principal_component_2", + "principal_component_3", ], - expected, - check_exact=False, - rtol=0.1, + index=[1633, 1672, 1690], + col_exact=False, ) @@ -538,29 +431,16 @@ def test_pipeline_standard_scaler_kmeans_fit_score_predict( # Check score to ensure the model was fitted score_result = pl.score(new_penguins).to_pandas() - score_expected = pd.DataFrame( - {"davies_bouldin_index": [7.542981], "mean_squared_distance": [94.692409]}, - dtype="Float64", - ) - score_expected = score_expected.reindex(index=score_expected.index.astype("Int64")) - - pd.testing.assert_frame_equal( - score_result, score_expected, check_exact=False, rtol=0.1 - ) + eval_metrics = ["davies_bouldin_index", "mean_squared_distance"] + utils.check_pandas_df_schema_and_index(score_result, columns=eval_metrics, index=1) predictions = pl.predict(new_penguins).to_pandas().sort_index() - assert predictions.shape == (6, 9) - result = predictions[["CENTROID_ID"]] - expected = pd.DataFrame( - {"CENTROID_ID": [1, 2, 1, 2, 1, 2]}, - dtype="Int64", - index=pd.Index( - ["test1", "test2", "test3", "test4", "test5", "test6"], - dtype="string[pyarrow]", - ), + utils.check_pandas_df_schema_and_index( + predictions, + columns=["CENTROID_ID"], + index=["test1", "test2", "test3", "test4", "test5", "test6"], + col_exact=False, ) - expected.index.name = "observation" - assert_pandas_df_equal(result, expected, ignore_order=True) def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_index): @@ -632,13 +512,11 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind ).set_index("tag_number") ) predictions = pl.predict(new_penguins).to_pandas() - expected = pd.DataFrame( - {"predicted_body_mass_g": [3909.2, 3436.0, 2860.0]}, - dtype="Float64", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - pd.testing.assert_frame_equal( - predictions[["predicted_body_mass_g"]], expected, check_exact=False, rtol=0.1 + utils.check_pandas_df_schema_and_index( + predictions, + columns=["predicted_body_mass_g"], + index=[1633, 1672, 1690], + col_exact=False, ) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 0fa1d90e8b..cac8483b5b 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -92,7 +92,8 @@ def make_uniq_udf(udf): target_code = source_code.replace(source_key, target_key, 1) f.write(target_code) spec = importlib.util.spec_from_file_location(udf_file_name, udf_file_path) - udf_uniq = getattr(spec.loader.load_module(), udf_uniq_name) + # TODO(b/340875260): fix type error + udf_uniq = getattr(spec.loader.load_module(), 
udf_uniq_name) # type: ignore # This is a bit of a hack but we need to remove the reference to a foreign # module, otherwise the serialization would keep the foreign module @@ -221,7 +222,7 @@ def stringify(x): ) -# @pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2, delay=120) def test_remote_function_binop(session, scalars_dfs, dataset_id, bq_cf_connection): try: diff --git a/tests/system/load/test_large_tables.py b/tests/system/load/test_large_tables.py index 871c846c79..cf1c787a58 100644 --- a/tests/system/load/test_large_tables.py +++ b/tests/system/load/test_large_tables.py @@ -80,7 +80,8 @@ def test_to_pandas_batches_large_table(): expected_row_count, expected_column_count = df.shape row_count = 0 - for df in df.to_pandas_batches(): + # TODO(b/340890167): fix type error + for df in df.to_pandas_batches(): # type: ignore batch_row_count, batch_column_count = df.shape assert batch_column_count == expected_column_count row_count += batch_row_count @@ -92,12 +93,14 @@ def test_to_pandas_batches_large_table(): assert row_count == expected_row_count +@pytest.mark.skip(reason="See if it caused kokoro build aborted.") def test_to_pandas_large_table(): df = bpd.read_gbq("load_testing.scalars_10gb") # df will be downloaded locally expected_row_count, expected_column_count = df.shape - df = df.to_pandas() + # TODO(b/340893653): fix type error + df = df.to_pandas() # type: ignore row_count, column_count = df.shape assert column_count == expected_column_count assert row_count == expected_row_count diff --git a/tests/system/small/bigquery/test_array.py b/tests/system/small/bigquery/test_array.py index a91669cd88..0664c31a3c 100644 --- a/tests/system/small/bigquery/test_array.py +++ b/tests/system/small/bigquery/test_array.py @@ -14,6 +14,7 @@ import numpy as np import pandas as pd +import pytest import bigframes.bigquery as bbq import bigframes.pandas as bpd @@ -23,10 +24,118 @@ def test_array_length(): series = bpd.Series([["A", "AA", "AAA"], ["BB", "B"], np.nan, [], ["C"]]) # TODO(b/336880368): Allow for NULL values to be input for ARRAY columns. # Once we actually store NULL values, this will be NULL where the input is NULL. 
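Before the new bigframes.bigquery.array_agg tests below, a small illustration of the behaviour they assert; the values mirror the "ints" case of test_array_agg_w_series_groupby.

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

s = bpd.Series([1, 2, 3, 4, 5], index=["a", "a", "b", "b", "c"])
result = bbq.array_agg(s.groupby(level=0))
# Per the test's expectation: index "a" -> [1, 2], "b" -> [3, 4], "c" -> [5]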
- expected = pd.Series([3, 2, 0, 0, 1]) + expected = bpd.Series([3, 2, 0, 0, 1]) pd.testing.assert_series_equal( bbq.array_length(series).to_pandas(), - expected, - check_dtype=False, - check_index_type=False, + expected.to_pandas(), + ) + + +@pytest.mark.parametrize( + ("input_data", "output_data"), + [ + pytest.param([1, 2, 3, 4, 5], [[1, 2], [3, 4], [5]], id="ints"), + pytest.param( + ["e", "d", "c", "b", "a"], + [["e", "d"], ["c", "b"], ["a"]], + id="reverse_strings", + ), + pytest.param( + [1.0, 2.0, np.nan, np.nan, np.nan], [[1.0, 2.0], [], []], id="nans" + ), + pytest.param( + [{"A": {"x": 1.0}}, {"A": {"z": 4.0}}, {}, {"B": "b"}, np.nan], + [[{"A": {"x": 1.0}}, {"A": {"z": 4.0}}], [{}, {"B": "b"}], []], + id="structs", + ), + ], +) +def test_array_agg_w_series_groupby(input_data, output_data): + input_index = ["a", "a", "b", "b", "c"] + series = bpd.Series(input_data, index=input_index) + result = bbq.array_agg(series.groupby(level=0)) + + expected = bpd.Series(output_data, index=["a", "b", "c"]) + pd.testing.assert_series_equal( + result.to_pandas(), # type: ignore + expected.to_pandas(), + ) + + +def test_array_agg_w_dataframe_groupby(): + data = { + "a": [1, 1, 2, 1], + "b": [2, None, 1, 2], + "c": [3, 4, 3, 2], + } + df = bpd.DataFrame(data) + result = bbq.array_agg(df.groupby(by=["b"])) + + expected_data = { + "b": [1.0, 2.0], + "a": [[2], [1, 1]], + "c": [[3], [3, 2]], + } + expected = bpd.DataFrame(expected_data).set_index("b") + + pd.testing.assert_frame_equal( + result.to_pandas(), # type: ignore + expected.to_pandas(), + ) + + +def test_array_agg_w_series(): + series = bpd.Series([1, 2, 3, 4, 5], index=["a", "a", "b", "b", "c"]) + # Mypy error expected: array_agg currently incompatible with Series. + # Test for coverage. + with pytest.raises(ValueError): + bbq.array_agg(series) # type: ignore + + +@pytest.mark.parametrize( + ("ascending", "expected_b", "expected_c"), + [ + pytest.param( + True, [["a", "b"], ["e", "d", "c"]], [[4, 5], [1, 2, 3]], id="asc" + ), + pytest.param( + False, [["b", "a"], ["c", "d", "e"]], [[5, 4], [3, 2, 1]], id="des" + ), + ], +) +def test_array_agg_reserve_order(ascending, expected_b, expected_c): + data = { + "a": [1, 1, 2, 2, 2], + "b": ["a", "b", "c", "d", "e"], + "c": [4, 5, 3, 2, 1], + } + df = bpd.DataFrame(data) + + result = bbq.array_agg(df.sort_values("c", ascending=ascending).groupby(by=["a"])) + expected_data = { + "a": [1, 2], + "b": expected_b, + "c": expected_c, + } + expected = bpd.DataFrame(expected_data).set_index("a") + + pd.testing.assert_frame_equal( + result.to_pandas(), # type: ignore + expected.to_pandas(), + ) + + +def test_array_agg_matches_after_explode(): + data = { + "index": np.arange(10), + "a": [np.random.randint(0, 10, 10) for _ in range(10)], + "b": [np.random.randint(0, 10, 10) for _ in range(10)], + } + df = bpd.DataFrame(data).set_index("index") + result = bbq.array_agg(df.explode(["a", "b"]).groupby(level=0)) + result.index.name = "index" + + pd.testing.assert_frame_equal( + result.to_pandas(), # type: ignore + df.to_pandas(), ) diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index c4c7eb4b88..9aff2fe773 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -228,12 +228,14 @@ def test_roc_curve_binary_classification_prediction_matches_sklearn(session): np_thresholds[1:], expected_thresholds[1:], ) + # TODO(b/340872435): fix type error np.testing.assert_array_equal( - np_fpr, + np_fpr, # type: ignore expected_fpr, ) + # 
TODO(b/340872435): fix type error np.testing.assert_array_equal( - np_tpr, + np_tpr, # type: ignore expected_tpr, ) @@ -323,12 +325,14 @@ def test_roc_curve_binary_classification_decision_matches_sklearn(session): np_thresholds[1:], expected_thresholds[1:], ) + # TODO(b/340872435): fix type error np.testing.assert_array_equal( - np_fpr, + np_fpr, # type: ignore expected_fpr, ) + # TODO(b/340872435): fix type error np.testing.assert_array_equal( - np_tpr, + np_tpr, # type: ignore expected_tpr, ) @@ -515,8 +519,9 @@ def test_confusion_matrix_column_index(session): ).astype("Int64") df = session.read_pandas(pd_df) confusion_matrix = metrics.confusion_matrix(df[["y_true"]], df[["y_pred"]]) + # TODO(b/340872435): fix type error expected_pd_df = ( - pd.DataFrame( + pd.DataFrame( # type: ignore {1: [1, 0, 1, 0], 2: [0, 0, 2, 0], 3: [0, 0, 0, 0], 4: [0, 1, 0, 1]} ) .astype("int64") @@ -557,7 +562,8 @@ def test_confusion_matrix_str_matches_sklearn(session): expected_confusion_matrix = sklearn_metrics.confusion_matrix( pd_df[["y_true"]], pd_df[["y_pred"]] ) - expected_pd_df = pd.DataFrame(expected_confusion_matrix).set_index( + # TODO(b/340872435): fix type error + expected_pd_df = pd.DataFrame(expected_confusion_matrix).set_index( # type: ignore [pd.Index(["ant", "bird", "cat"])] ) expected_pd_df.columns = pd.Index(["ant", "bird", "cat"]) @@ -595,8 +601,9 @@ def test_recall_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) + # TODO(b/340872435): fix type error recall = metrics.recall_score( - df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None + df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None # type: ignore ) expected_values = [1.000000, 0.000000, 0.666667] expected_index = [0, 1, 2] @@ -613,7 +620,8 @@ def test_recall_score_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) - recall = metrics.recall_score(df[["y_true"]], df[["y_pred"]], average=None) + # TODO(b/340872435): fix type error + recall = metrics.recall_score(df[["y_true"]], df[["y_pred"]], average=None) # type: ignore expected_values = sklearn_metrics.recall_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -630,7 +638,8 @@ def test_recall_score_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - recall = metrics.recall_score(df[["y_true"]], df[["y_pred"]], average=None) + # TODO(b/340872435): fix type error + recall = metrics.recall_score(df[["y_true"]], df[["y_pred"]], average=None) # type: ignore expected_values = sklearn_metrics.recall_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -647,7 +656,8 @@ def test_recall_score_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - recall = metrics.recall_score(df["y_true"], df["y_pred"], average=None) + # TODO(b/340872435): fix type error + recall = metrics.recall_score(df["y_true"], df["y_pred"], average=None) # type: ignore expected_values = [1.000000, 0.000000, 0.666667] expected_index = [0, 1, 2] expected_recall = pd.Series(expected_values, index=expected_index) @@ -663,8 +673,9 @@ def test_precision_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) + # TODO(b/340872435): fix type error precision_score = metrics.precision_score( - df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None + df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None # type: ignore ) expected_values = [0.666667, 0.000000, 0.666667] expected_index = [0, 1, 2] @@ -683,8 +694,9 @@ def 
test_precision_score_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) + # TODO(b/340872435): fix type error precision_score = metrics.precision_score( - df[["y_true"]], df[["y_pred"]], average=None + df[["y_true"]], df[["y_pred"]], average=None # type: ignore ) expected_values = sklearn_metrics.precision_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None @@ -704,8 +716,9 @@ def test_precision_score_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) + # TODO(b/340872435): fix type error precision_score = metrics.precision_score( - df[["y_true"]], df[["y_pred"]], average=None + df[["y_true"]], df[["y_pred"]], average=None # type: ignore ) expected_values = sklearn_metrics.precision_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None @@ -725,7 +738,8 @@ def test_precision_score_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - precision_score = metrics.precision_score(df["y_true"], df["y_pred"], average=None) + # TODO(b/340872435): fix type error + precision_score = metrics.precision_score(df["y_true"], df["y_pred"], average=None) # type: ignore expected_values = [0.666667, 0.000000, 0.666667] expected_index = [0, 1, 2] expected_precision = pd.Series(expected_values, index=expected_index) @@ -743,8 +757,9 @@ def test_f1_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) + # TODO(b/340872435): fix type error f1_score = metrics.f1_score( - df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None + df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None # type: ignore ) expected_values = [0.8, 0.000000, 0.666667] expected_index = [0, 1, 2] @@ -761,7 +776,8 @@ def test_f1_score_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) - f1_score = metrics.f1_score(df[["y_true"]], df[["y_pred"]], average=None) + # TODO(b/340872435): fix type error + f1_score = metrics.f1_score(df[["y_true"]], df[["y_pred"]], average=None) # type: ignore expected_values = sklearn_metrics.f1_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -778,7 +794,8 @@ def test_f1_score_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - f1_score = metrics.f1_score(df[["y_true"]], df[["y_pred"]], average=None) + # TODO(b/340872435): fix type error + f1_score = metrics.f1_score(df[["y_true"]], df[["y_pred"]], average=None) # type: ignore expected_values = sklearn_metrics.f1_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -795,7 +812,8 @@ def test_f1_score_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - f1_score = metrics.f1_score(df["y_true"], df["y_pred"], average=None) + # TODO(b/340872435): fix type error + f1_score = metrics.f1_score(df["y_true"], df["y_pred"], average=None) # type: ignore expected_values = [0.8, 0.000000, 0.666667] expected_index = [0, 1, 2] expected_f1 = pd.Series(expected_values, index=expected_index) diff --git a/tests/system/small/ml/test_model_selection.py b/tests/system/small/ml/test_model_selection.py index 9eb3645591..ca14186a4d 100644 --- a/tests/system/small/ml/test_model_selection.py +++ b/tests/system/small/ml/test_model_selection.py @@ -130,12 +130,17 @@ def test_train_test_split_seeded_correct_rows( X, y, random_state=42 ) - X_train = X_train.to_pandas().sort_index() - X_test = X_test.to_pandas().sort_index() - y_train = y_train.to_pandas().sort_index() - y_test = y_test.to_pandas().sort_index() - - train_index = pd.Index( + # TODO(b/340876926): fix 
type error + X_train = X_train.to_pandas().sort_index() # type: ignore + # TODO(b/340876926): fix type error + X_test = X_test.to_pandas().sort_index() # type: ignore + # TODO(b/340876926): fix type error + y_train = y_train.to_pandas().sort_index() # type: ignore + # TODO(b/340876926): fix type error + y_test = y_test.to_pandas().sort_index() # type: ignore + + # TODO(b/340876926): fix type error + train_index = pd.Index( # type: ignore [ 144, 146, @@ -162,13 +167,15 @@ def test_train_test_split_seeded_correct_rows( dtype="Int64", name="rowindex", ) - test_index = pd.Index( + # TODO(b/340876926): fix type error + test_index = pd.Index( # type: ignore [148, 161, 226, 269, 278, 289, 291], dtype="Int64", name="rowindex" ) all_data.index.name = "_" + # TODO(b/340876926): fix type error pd.testing.assert_frame_equal( - X_train, + X_train, # type: ignore all_data[ [ "species", @@ -177,8 +184,9 @@ def test_train_test_split_seeded_correct_rows( ] ].loc[train_index], ) + # TODO(b/340876926): fix type error pd.testing.assert_frame_equal( - X_test, + X_test, # type: ignore all_data[ [ "species", @@ -187,16 +195,18 @@ def test_train_test_split_seeded_correct_rows( ] ].loc[test_index], ) + # TODO(b/340876926): fix type error pd.testing.assert_frame_equal( - y_train, + y_train, # type: ignore all_data[ [ "body_mass_g", ] ].loc[train_index], ) + # TODO(b/340876926): fix type error pd.testing.assert_frame_equal( - y_test, + y_test, # type: ignore all_data[ [ "body_mass_g", diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 2824e86979..838bc11108 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -310,7 +310,8 @@ def test_dt_floor(scalars_dfs, col_name, freq): def test_dt_compare_coerce_str_datetime(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df["datetime_col"] - bf_result = (bf_series >= "2024-01-01").to_pandas() + # TODO(b/340878286): fix type error + bf_result = (bf_series >= "2024-01-01").to_pandas() # type: ignore pd_result = scalars_pandas_df["datetime_col"] >= pd.to_datetime("2024-01-01") diff --git a/tests/system/small/operations/test_plotting.py b/tests/system/small/operations/test_plotting.py index faf7cb7e6b..e0ef84641c 100644 --- a/tests/system/small/operations/test_plotting.py +++ b/tests/system/small/operations/test_plotting.py @@ -258,8 +258,9 @@ def test_scatter_args_s(s): ax = df.plot.scatter(x="a", y="b", s="s") pd_ax = pd_df.plot.scatter(x="a", y="b", s="s") + # TODO(b/340891723): fix type error tm.assert_numpy_array_equal( - ax.collections[0].get_sizes(), pd_ax.collections[0].get_sizes() + ax.collections[0].get_sizes(), pd_ax.collections[0].get_sizes() # type: ignore ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 5ed6908640..4b50922c0d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -147,7 +147,8 @@ def test_df_construct_inline_respects_location(): df = bpd.DataFrame([[1, 2, 3], [4, 5, 6]]) repr(df) - table = bpd.get_global_session().bqclient.get_table(df.query_job.destination) + # TODO(b/340876936): fix type error + table = bpd.get_global_session().bqclient.get_table(df.query_job.destination) # type: ignore assert table.location == "europe-west1" @@ -752,8 +753,10 @@ def test_assign_listlike_to_empty_df(session): def test_assign_to_empty_df_multiindex_error(session): empty_df = 
dataframe.DataFrame(session=session) empty_pandas_df = pd.DataFrame() - empty_df["empty_col_1"] = [] - empty_df["empty_col_2"] = [] + # TODO(b/340876936): fix type error + empty_df["empty_col_1"] = [] # type: ignore + # TODO(b/340876936): fix type error + empty_df["empty_col_2"] = [] # type: ignore empty_pandas_df["empty_col_1"] = [] empty_pandas_df["empty_col_2"] = [] empty_df = empty_df.set_index(["empty_col_1", "empty_col_2"]) @@ -1341,20 +1344,34 @@ def test_get_dtypes(scalars_df_default_index): dtypes, pd.Series( { - "bool_col": pd.BooleanDtype(), - "bytes_col": pd.ArrowDtype(pa.binary()), - "date_col": pd.ArrowDtype(pa.date32()), - "datetime_col": pd.ArrowDtype(pa.timestamp("us")), - "geography_col": gpd.array.GeometryDtype(), - "int64_col": pd.Int64Dtype(), - "int64_too": pd.Int64Dtype(), - "numeric_col": pd.ArrowDtype(pa.decimal128(38, 9)), - "float64_col": pd.Float64Dtype(), - "rowindex": pd.Int64Dtype(), - "rowindex_2": pd.Int64Dtype(), - "string_col": pd.StringDtype(storage="pyarrow"), - "time_col": pd.ArrowDtype(pa.time64("us")), - "timestamp_col": pd.ArrowDtype(pa.timestamp("us", tz="UTC")), + # TODO(b/340876936): fix type error + "bool_col": pd.BooleanDtype(), # type: ignore + # TODO(b/340876936): fix type error + "bytes_col": pd.ArrowDtype(pa.binary()), # type: ignore + # TODO(b/340876936): fix type error + "date_col": pd.ArrowDtype(pa.date32()), # type: ignore + # TODO(b/340876936): fix type error + "datetime_col": pd.ArrowDtype(pa.timestamp("us")), # type: ignore + # TODO(b/340876936): fix type error + "geography_col": gpd.array.GeometryDtype(), # type: ignore + # TODO(b/340876936): fix type error + "int64_col": pd.Int64Dtype(), # type: ignore + # TODO(b/340876936): fix type error + "int64_too": pd.Int64Dtype(), # type: ignore + # TODO(b/340876936): fix type error + "numeric_col": pd.ArrowDtype(pa.decimal128(38, 9)), # type: ignore + # TODO(b/340876936): fix type error + "float64_col": pd.Float64Dtype(), # type: ignore + # TODO(b/340876936): fix type error + "rowindex": pd.Int64Dtype(), # type: ignore + # TODO(b/340876936): fix type error + "rowindex_2": pd.Int64Dtype(), # type: ignore + # TODO(b/340876936): fix type error + "string_col": pd.StringDtype(storage="pyarrow"), # type: ignore + # TODO(b/340876936): fix type error + "time_col": pd.ArrowDtype(pa.time64("us")), # type: ignore + # TODO(b/340876936): fix type error + "timestamp_col": pd.ArrowDtype(pa.timestamp("us", tz="UTC")), # type: ignore } ), ) @@ -1784,8 +1801,11 @@ def test_combine( def test_df_update(overwrite, filter_func): if pd.__version__.startswith("1."): pytest.skip("dtype handled differently in pandas 1.x.") - index1 = pandas.Index([1, 2, 3, 4], dtype="Int64") - index2 = pandas.Index([1, 2, 4, 5], dtype="Int64") + + # TODO(b/340876936): fix type error + index1 = pandas.Index([1, 2, 3, 4], dtype="Int64") # type: ignore + # TODO(b/340876936): fix type error + index2 = pandas.Index([1, 2, 4, 5], dtype="Int64") # type: ignore pd_df1 = pandas.DataFrame( {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 ) @@ -1845,8 +1865,10 @@ def test_df_idxmax(): ], ) def test_df_align(join, axis): - index1 = pandas.Index([1, 2, 3, 4], dtype="Int64") - index2 = pandas.Index([1, 2, 4, 5], dtype="Int64") + # TODO(b/340876936): fix type error + index1 = pandas.Index([1, 2, 3, 4], dtype="Int64") # type: ignore + # TODO(b/340876936): fix type error + index2 = pandas.Index([1, 2, 4, 5], dtype="Int64") # type: ignore pd_df1 = pandas.DataFrame( {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", 
index=index1 ) @@ -1863,8 +1885,10 @@ def test_df_align(join, axis): pd_result1, pd_result2 = pd_df1.align(pd_df2, join=join, axis=axis) # Don't check dtype as pandas does unnecessary float conversion - pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False) - pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False) + # TODO(b/340876936): fix type error + pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False) # type: ignore + # TODO(b/340876936): fix type error + pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False) # type: ignore def test_combine_first( @@ -2500,9 +2524,11 @@ def test_df_transpose(): # Include some floats to ensure type coercion values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]] # Test complex case of both axes being multi-indices with non-unique elements - columns = pd.Index(["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow")) + # TODO(b/340876936): fix type error + columns = pd.Index(["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow")) # type: ignore columns_multi = pd.MultiIndex.from_arrays([columns, columns], names=["c1", "c2"]) - index = pd.Index(["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow")) + # TODO(b/340876936): fix type error + index = pd.Index(["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow")) # type: ignore rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) @@ -3670,8 +3696,10 @@ def test_df_setattr_index(): [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] ) bf_df = dataframe.DataFrame(pd_df) - pd_df.index = [4, 5] - bf_df.index = [4, 5] + # TODO(b/340876936): fix type error + pd_df.index = [4, 5] # type: ignore + # TODO(b/340876936): fix type error + bf_df.index = [4, 5] # type: ignore assert_pandas_df_equal( pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False @@ -3683,8 +3711,10 @@ def test_df_setattr_columns(): [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] ) bf_df = dataframe.DataFrame(pd_df) - pd_df.columns = [4, 5, 6] - bf_df.columns = [4, 5, 6] + # TODO(b/340876936): fix type error + pd_df.columns = [4, 5, 6] # type: ignore + # TODO(b/340876936): fix type error + bf_df.columns = [4, 5, 6] # type: ignore assert_pandas_df_equal( pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False @@ -3778,7 +3808,8 @@ def test_iloc_list_multiindex(scalars_dfs): def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index): - index_list = [] + # TODO(b/340876936): fix type error + index_list = [] # type: ignore bf_result = scalars_df_index.iloc[index_list] pd_result = scalars_pandas_df_index.iloc[index_list] diff --git a/tests/system/small/test_empty_index.py b/tests/system/small/test_empty_index.py new file mode 100644 index 0000000000..7a1715e3d1 --- /dev/null +++ b/tests/system/small/test_empty_index.py @@ -0,0 +1,212 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import pandas as pd +import pytest + +import bigframes.exceptions +import bigframes.pandas as bpd +from tests.system.utils import skip_legacy_pandas + + +def test_empty_index_materialize( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_empty_index.to_pandas() + pd.testing.assert_frame_equal( + bf_result, scalars_pandas_df_default_index, check_index_type=False + ) + + +def test_empty_index_series_repr( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_empty_index["int64_too"].head(5).__repr__() + pd_result = ( + scalars_pandas_df_default_index["int64_too"] + .head(5) + .to_string(dtype=True, index=False, length=False, name=True) + ) + assert bf_result == pd_result + + +def test_empty_index_dataframe_repr( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_empty_index[["int64_too", "int64_col"]].head(5).__repr__() + pd_result = ( + scalars_pandas_df_default_index[["int64_too", "int64_col"]] + .head(5) + .to_string(index=False) + ) + assert bf_result == pd_result + "\n\n[5 rows x 2 columns]" + + +def test_empty_index_reset_index( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_empty_index.reset_index().to_pandas() + pd_result = scalars_pandas_df_default_index.reset_index(drop=True) + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +def test_empty_index_set_index(scalars_df_empty_index, scalars_pandas_df_default_index): + bf_result = scalars_df_empty_index.set_index("int64_col").to_pandas() + pd_result = scalars_pandas_df_default_index.set_index("int64_col") + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_empty_index_concat(scalars_df_empty_index, scalars_pandas_df_default_index): + bf_result = bpd.concat( + [scalars_df_empty_index, scalars_df_empty_index], axis=0 + ).to_pandas() + pd_result = pd.concat( + [scalars_pandas_df_default_index, scalars_pandas_df_default_index], axis=0 + ) + pd.testing.assert_frame_equal(bf_result, pd_result.reset_index(drop=True)) + + +def test_empty_index_aggregate(scalars_df_empty_index, scalars_pandas_df_default_index): + bf_result = scalars_df_empty_index.count().to_pandas() + pd_result = scalars_pandas_df_default_index.count() + + pd_result.index = pd_result.index.astype("string[pyarrow]") + + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_empty_index_groupby_aggregate( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_empty_index.groupby("int64_col").count().to_pandas() + pd_result = scalars_pandas_df_default_index.groupby("int64_col").count() + + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@skip_legacy_pandas +def test_empty_index_analytic(scalars_df_empty_index, scalars_pandas_df_default_index): + bf_result = scalars_df_empty_index["int64_col"].cumsum().to_pandas() + pd_result = scalars_pandas_df_default_index["int64_col"].cumsum() + pd.testing.assert_series_equal( + bf_result, pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_empty_index_groupby_analytic( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = ( + scalars_df_empty_index.groupby("bool_col")["int64_col"].cummax().to_pandas() + ) + pd_result = scalars_pandas_df_default_index.groupby("bool_col")[ + "int64_col" + 
].cummax() + pd.testing.assert_series_equal( + bf_result, pd_result.reset_index(drop=True), check_dtype=False + ) + + +@skip_legacy_pandas +def test_empty_index_stack(scalars_df_empty_index, scalars_pandas_df_default_index): + stacking_cols = ["int64_col", "int64_too"] + bf_result = scalars_df_empty_index[stacking_cols].stack().to_pandas() + pd_result = ( + scalars_pandas_df_default_index[stacking_cols] + .stack(future_stack=True) + .droplevel(level=0, axis=0) + ) + pd_result.index = pd_result.index.astype(bf_result.index.dtype) + pd.testing.assert_series_equal( + bf_result, + pd_result, + check_dtype=False, + ) + + +def test_empty_index_series_self_aligns( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = ( + scalars_df_empty_index["int64_col"] + scalars_df_empty_index["int64_too"] + ) + pd_result = ( + scalars_pandas_df_default_index["int64_col"] + + scalars_pandas_df_default_index["int64_too"] + ) + pd.testing.assert_series_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_empty_index_df_self_aligns( + scalars_df_empty_index, scalars_pandas_df_default_index +): + bf_result = ( + scalars_df_empty_index[["int64_col", "float64_col"]] + + scalars_df_empty_index[["int64_col", "float64_col"]] + ) + pd_result = ( + scalars_pandas_df_default_index[["int64_col", "float64_col"]] + + scalars_pandas_df_default_index[["int64_col", "float64_col"]] + ) + pd.testing.assert_frame_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_empty_index_df_concat(scalars_df_empty_index, scalars_pandas_df_default_index): + bf_result = bpd.concat([scalars_df_empty_index, scalars_df_empty_index]) + pd_result = pd.concat( + [scalars_pandas_df_default_index, scalars_pandas_df_default_index] + ) + pd.testing.assert_frame_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + +def test_empty_index_align_error(scalars_df_empty_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + _ = ( + scalars_df_empty_index["int64_col"] + + scalars_df_empty_index["int64_col"].cumsum() + ) + + +def test_empty_index_loc_error(scalars_df_empty_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + scalars_df_empty_index["int64_col"].loc[1] + + +def test_empty_index_at_error(scalars_df_empty_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + scalars_df_empty_index["int64_col"].at[1] + + +def test_empty_index_idxmin_error(scalars_df_empty_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + scalars_df_empty_index[["int64_col", "int64_too"]].idxmin() + + +def test_empty_index_index_property(scalars_df_empty_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + _ = scalars_df_empty_index.index + + +def test_empty_index_transpose(scalars_df_empty_index): + with pytest.raises(bigframes.exceptions.NullIndexError): + _ = scalars_df_empty_index.T diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index fcaca7a493..088211d7fc 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -242,11 +242,14 @@ def test_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): model.fit(X_train, y_train) assert model is not None - assert model._bqml_model.model.encryption_configuration is not None - assert model._bqml_model.model.encryption_configuration.kms_key_name == bq_cmek + # TODO(b/340879287): fix type error + assert 
model._bqml_model.model.encryption_configuration is not None # type: ignore + # TODO(b/340879287): fix type error + assert model._bqml_model.model.encryption_configuration.kms_key_name == bq_cmek # type: ignore # Assert that model exists in BQ with intended encryption - model_bq = session_with_bq_cmek.bqclient.get_model(model._bqml_model.model_name) + # TODO(b/340879287): fix type error + model_bq = session_with_bq_cmek.bqclient.get_model(model._bqml_model.model_name) # type: ignore assert model_bq.encryption_configuration.kms_key_name == bq_cmek # Explicitly save the model to a destination and assert that encryption holds @@ -257,10 +260,12 @@ def test_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): f"{model_ref.project}.{model_ref.dataset_id}.{model_ref.model_id}" ) new_model = model.to_gbq(model_ref_full_name) - assert new_model._bqml_model.model.encryption_configuration.kms_key_name == bq_cmek + # TODO(b/340879287): fix type error + assert new_model._bqml_model.model.encryption_configuration.kms_key_name == bq_cmek # type: ignore # Assert that model exists in BQ with intended encryption - model_bq = session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) + # TODO(b/340879287): fix type error + model_bq = session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) # type: ignore assert model_bq.encryption_configuration.kms_key_name == bq_cmek # Assert that model registration keeps the encryption @@ -274,9 +279,11 @@ def test_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): # https://cloud.google.com/vertex-ai/docs/general/cmek#create_resources_with_the_kms_key. # bigframes.ml does not provide any API for the model deployment. model_registered = new_model.register() + # TODO(b/340879287): fix type error assert ( - model_registered._bqml_model.model.encryption_configuration.kms_key_name + model_registered._bqml_model.model.encryption_configuration.kms_key_name # type: ignore == bq_cmek ) - model_bq = session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) + # TODO(b/340879287): fix type error + model_bq = session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) # type: ignore assert model_bq.encryption_configuration.kms_key_name == bq_cmek diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index c419dc4907..58fd346bc1 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -24,17 +24,20 @@ def test_index_construct_from_list(): bf_result = bpd.Index( [3, 14, 159], dtype=pd.Int64Dtype(), name="my_index" ).to_pandas() - pd_result = pd.Index([3, 14, 159], dtype=pd.Int64Dtype(), name="my_index") + # TODO(b/340878489): fix type error + pd_result = pd.Index([3, 14, 159], dtype=pd.Int64Dtype(), name="my_index") # type: ignore pd.testing.assert_index_equal(bf_result, pd_result) def test_index_construct_from_series(): + # TODO(b/340878489): fix type error bf_result = bpd.Index( bpd.Series([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name"), name="index_name", dtype=pd.Int64Dtype(), - ).to_pandas() - pd_result = pd.Index( + ).to_pandas() # type: ignore + # TODO(b/340878489): fix type error + pd_result = pd.Index( # type: ignore pd.Series([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name"), name="index_name", dtype=pd.Int64Dtype(), @@ -46,11 +49,14 @@ def test_index_construct_from_index(): bf_index_input = bpd.Index( [3, 14, 159], dtype=pd.Float64Dtype(), name="series_name" ) + # TODO(b/340878489): fix type error 
bf_result = bpd.Index( - bf_index_input, dtype=pd.Int64Dtype(), name="index_name" + bf_index_input, dtype=pd.Int64Dtype(), name="index_name" # type: ignore ).to_pandas() - pd_index_input = pd.Index([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name") - pd_result = pd.Index(pd_index_input, dtype=pd.Int64Dtype(), name="index_name") + # TODO(b/340878489): fix type error + pd_index_input = pd.Index([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name") # type: ignore + # TODO(b/340878489): fix type error + pd_result = pd.Index(pd_index_input, dtype=pd.Int64Dtype(), name="index_name") # type: ignore pd.testing.assert_index_equal(bf_result, pd_result) @@ -365,7 +371,8 @@ def test_index_isin(scalars_df_index, scalars_pandas_df_index): pd_result_array = scalars_pandas_df_index.set_index("int64_col").index.isin( [2, 55555, 4] ) - pd.testing.assert_index_equal( + # TODO(b/340878489): fix type error + pd.testing.assert_index_equal( # type: ignore pd.Index(pd_result_array), bf_series, check_names=False, diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 613ad945c1..de631ee20e 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -850,12 +850,14 @@ def test_column_multi_index_stack(level): bf_result = bf_df.stack(level=level).to_pandas() # BigFrames emulates future_stack impl - pd_result = pd_df.stack(level=level, future_stack=True) + # TODO(b/340884387): fix type error + pd_result = pd_df.stack(level=level, future_stack=True) # type: ignore # Pandas produces NaN, where bq dataframes produces pd.NA # Column ordering seems to depend on pandas version + # TODO(b/340884387): fix type error pandas.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False + bf_result, pd_result, check_dtype=False, check_index_type=False # type: ignore ) @@ -889,9 +891,11 @@ def test_column_multi_index_melt(): def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "int64_col", "rowindex_2"] - level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]") + # TODO(b/340884387): fix type error + level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]") # type: ignore # Need resulting column to be pyarrow string rather than object dtype - level2 = pandas.Index(["a", "b", "b"], dtype="string[pyarrow]") + # TODO(b/340884387): fix type error + level2 = pandas.Index(["a", "b", "b"], dtype="string[pyarrow]") # type: ignore multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) bf_df = scalars_df_index[columns].copy() bf_df.columns = multi_columns @@ -1185,9 +1189,10 @@ def test_explode_w_multi_index(): df = bpd.DataFrame(data, columns=multi_level_columns) pd_df = df.to_pandas() + # TODO(b/340884387): fix type error pandas.testing.assert_frame_equal( df["col0"].explode("col00").to_pandas(), - pd_df["col0"].explode("col00"), + pd_df["col0"].explode("col00"), # type: ignore check_dtype=False, check_index_type=False, ) @@ -1197,7 +1202,8 @@ def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index columns = ["int64_too", "int64_col", "rowindex_2"] level1 = pandas.Index(["b", "c", "d"]) # Need resulting column to be pyarrow string rather than object dtype - level2 = pandas.Index([None, "b", "b"], dtype="string[pyarrow]") + # TODO(b/340884387): fix type error + level2 = pandas.Index([None, "b", "b"], dtype="string[pyarrow]") # type: ignore multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) bf_df = 
scalars_df_index[columns].copy() bf_df.columns = multi_columns diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index d543f92655..256046f8b1 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -395,9 +395,11 @@ def test_cut(scalars_dfs): # make sure the result is a supported dtype assert bf_result.dtype == bpd.Int64Dtype() - bf_result = bf_result.to_pandas() + # TODO(b/340884971): fix type error + bf_result = bf_result.to_pandas() # type: ignore pd_result = pd_result.astype("Int64") - pd.testing.assert_series_equal(bf_result, pd_result) + # TODO(b/340884971): fix type error + pd.testing.assert_series_equal(bf_result, pd_result) # type: ignore def test_cut_default_labels(scalars_dfs): @@ -528,10 +530,12 @@ def test_qcut(scalars_dfs, q): ) bf_result = bpd.qcut(scalars_df["float64_col"], q, labels=False, duplicates="drop") - bf_result = bf_result.to_pandas() + # TODO(b/340884971): fix type error + bf_result = bf_result.to_pandas() # type: ignore pd_result = pd_result.astype("Int64") - pd.testing.assert_series_equal(bf_result, pd_result) + # TODO(b/340884971): fix type error + pd.testing.assert_series_equal(bf_result, pd_result) # type: ignore @pytest.mark.parametrize( @@ -568,8 +572,9 @@ def test_to_datetime_scalar(arg, utc, unit, format): ], ) def test_to_datetime_iterable(arg, utc, unit, format): + # TODO(b/340884971): fix type error bf_result = ( - bpd.to_datetime(arg, utc=utc, unit=unit, format=format) + bpd.to_datetime(arg, utc=utc, unit=unit, format=format) # type: ignore .to_pandas() .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]") ) @@ -584,8 +589,9 @@ def test_to_datetime_iterable(arg, utc, unit, format): def test_to_datetime_series(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col = "int64_too" + # TODO(b/340884971): fix type error bf_result = ( - bpd.to_datetime(scalars_df[col], unit="s").to_pandas().astype("datetime64[s]") + bpd.to_datetime(scalars_df[col], unit="s").to_pandas().astype("datetime64[s]") # type: ignore ) pd_result = pd.Series(pd.to_datetime(scalars_pandas_df[col], unit="s")) pd.testing.assert_series_equal( @@ -608,7 +614,8 @@ def test_to_datetime_series(scalars_dfs): ], ) def test_to_datetime_unit_param(arg, unit): - bf_result = bpd.to_datetime(arg, unit=unit).to_pandas().astype("datetime64[ns]") + # TODO(b/340884971): fix type error + bf_result = bpd.to_datetime(arg, unit=unit).to_pandas().astype("datetime64[ns]") # type: ignore pd_result = pd.Series(pd.to_datetime(arg, unit=unit)).dt.floor("us") pd.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_names=False @@ -625,8 +632,9 @@ def test_to_datetime_unit_param(arg, unit): ], ) def test_to_datetime_format_param(arg, utc, format): + # TODO(b/340884971): fix type error bf_result = ( - bpd.to_datetime(arg, utc=utc, format=format) + bpd.to_datetime(arg, utc=utc, format=format) # type: ignore .to_pandas() .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]") ) @@ -634,3 +642,101 @@ def test_to_datetime_format_param(arg, utc, format): pd.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_names=False ) + + +@pytest.mark.parametrize( + ("arg", "utc", "output_in_utc", "format"), + [ + ( + ["2014-08-15 08:15:12", "2011-08-15 08:15:12", "2015-08-15 08:15:12"], + False, + False, + None, + ), + ( + [ + "2008-12-25 05:30:00Z", + "2008-12-25 05:30:00-00:00", + "2008-12-25 05:30:00+00:00", + "2008-12-25 05:30:00-0000", + "2008-12-25 05:30:00+0000", + "2008-12-25 
05:30:00-00", + "2008-12-25 05:30:00+00", + ], + False, + True, + None, + ), + ( + ["2014-08-15 08:15:12", "2011-08-15 08:15:12", "2015-08-15 08:15:12"], + True, + True, + "%Y-%m-%d %H:%M:%S", + ), + ( + [ + "2014-08-15 08:15:12+05:00", + "2011-08-15 08:15:12+05:00", + "2015-08-15 08:15:12+05:00", + ], + True, + True, + None, + ), + ], +) +def test_to_datetime_string_inputs(arg, utc, output_in_utc, format): + # TODO(b/340884971): fix type error + bf_result = ( + bpd.to_datetime(arg, utc=utc, format=format) # type: ignore + .to_pandas() + .astype("datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]") + ) + pd_result = pd.Series(pd.to_datetime(arg, utc=utc, format=format)).dt.floor("us") + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_names=False + ) + + +@pytest.mark.parametrize( + ("arg", "utc", "output_in_utc"), + [ + ( + [datetime(2023, 1, 1, 12, 0), datetime(2023, 2, 1, 12, 0)], + False, + False, + ), + ( + [datetime(2023, 1, 1, 12, 0), datetime(2023, 2, 1, 12, 0)], + True, + True, + ), + ( + [ + datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")), + datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")), + ], + True, + True, + ), + ( + [ + datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("America/New_York")), + datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")), + ], + True, + True, + ), + ], +) +def test_to_datetime_timestamp_inputs(arg, utc, output_in_utc): + # TODO(b/340884971): fix type error + bf_result = ( + bpd.to_datetime(arg, utc=utc) # type: ignore + .to_pandas() + .astype("datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]") + ) + pd_result = pd.Series(pd.to_datetime(arg, utc=utc)).dt.floor("us") + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_names=False + ) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 9c60c821a7..4a39e75ff9 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -623,7 +623,8 @@ def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id): indirect_df = bigframes.dataframe.DataFrame(src) indirect_df = indirect_df.assign(y=indirect_df.x.apply(square)) - indirect_df = indirect_df.to_pandas() + # TODO(b/340875260): fix type error + indirect_df = indirect_df.to_pandas() # type: ignore assert_pandas_df_equal( direct_df, indirect_df, ignore_order=True, check_index_type=False diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index fa514784c0..dbc8ddec6f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -49,7 +49,8 @@ def test_series_construct_nullable_ints(): [1, 3, bigframes.pandas.NA], index=[0, 4, bigframes.pandas.NA] ).to_pandas() - expected_index = pd.Index( + # TODO(b/340885567): fix type error + expected_index = pd.Index( # type: ignore [0, 4, None], dtype=pd.Int64Dtype(), ) @@ -1429,13 +1430,13 @@ def test_numeric_literal(scalars_dfs): assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) -def test_repr(scalars_dfs): +def test_series_small_repr(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" bf_series = scalars_df[col_name] pd_series = scalars_pandas_df[col_name] - assert repr(bf_series) == repr(pd_series) + assert repr(bf_series) == pd_series.to_string(length=False, dtype=True, name=True) def test_sum(scalars_dfs): @@ -2957,7 +2958,8 @@ def test_string_astype_date(): bf_series = series.Series(pd_series) - pd_result = 
pd_series.astype("date32[day][pyarrow]") + # TODO(b/340885567): fix type error + pd_result = pd_series.astype("date32[day][pyarrow]") # type: ignore bf_result = bf_series.astype("date32[day][pyarrow]").to_pandas() pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) @@ -3661,9 +3663,10 @@ def test_series_explode_w_index(index, ignore_index): data = [[], [200.0, 23.12], [4.5, -9.0], [1.0]] s = bigframes.pandas.Series(data, index=index) pd_s = pd.Series(data, index=index) + # TODO(b/340885567): fix type error pd.testing.assert_series_equal( - s.explode(ignore_index=ignore_index).to_pandas(), - pd_s.explode(ignore_index=ignore_index).astype(pd.Float64Dtype()), + s.explode(ignore_index=ignore_index).to_pandas(), # type: ignore + pd_s.explode(ignore_index=ignore_index).astype(pd.Float64Dtype()), # type: ignore check_index_type=False, ) @@ -3681,12 +3684,14 @@ def test_series_explode_reserve_order(ignore_index, ordered): s = bigframes.pandas.Series(data) pd_s = pd.Series(data) - res = s.explode(ignore_index=ignore_index).to_pandas(ordered=ordered) - pd_res = pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype()) + # TODO(b/340885567): fix type error + res = s.explode(ignore_index=ignore_index).to_pandas(ordered=ordered) # type: ignore + # TODO(b/340885567): fix type error + pd_res = pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype()) # type: ignore + pd_res.index = pd_res.index.astype(pd.Int64Dtype()) pd.testing.assert_series_equal( res if ordered else res.sort_index(), pd_res, - check_index_type=False, ) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 5daa01ad38..2b7c6178ff 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -14,11 +14,12 @@ import io import random +import re import tempfile import textwrap import time import typing -from typing import List, Sequence +from typing import List, Optional, Sequence import google import google.cloud.bigquery as bigquery @@ -68,15 +69,6 @@ def test_read_gbq_tokyo( ["my_strings"], id="one_cols_in_query", ), - pytest.param( - "{scalars_table_id}", - ["unknown"], - marks=pytest.mark.xfail( - raises=ValueError, - reason="Column `unknown` not found in this table.", - ), - id="unknown_col", - ), ], ) def test_read_gbq_w_columns( @@ -91,6 +83,38 @@ def test_read_gbq_w_columns( assert df.columns.tolist() == columns +def test_read_gbq_w_unknown_column( + session: bigframes.Session, + scalars_table_id: str, +): + with pytest.raises( + ValueError, + match=re.escape( + "Column 'int63_col' of `columns` not found in this table. Did you mean 'int64_col'?" + ), + ): + session.read_gbq( + scalars_table_id, + columns=["string_col", "int63_col", "bool_col"], + ) + + +def test_read_gbq_w_unknown_index_col( + session: bigframes.Session, + scalars_table_id: str, +): + with pytest.raises( + ValueError, + match=re.escape( + "Column 'int64_two' of `index_col` not found in this table. Did you mean 'int64_too'?" + ), + ): + session.read_gbq( + scalars_table_id, + index_col=["int64_col", "int64_two"], + ) + + @pytest.mark.parametrize( ("query_or_table", "index_col"), [ @@ -248,6 +272,9 @@ def test_read_gbq_w_primary_keys_table( df = session.read_gbq(f"{table.project}.{table.dataset_id}.{table.table_id}") result = df.head(100).to_pandas() + # Verify that primary keys are used as the index. + assert list(result.index.names) == list(primary_keys) + # Verify that the DataFrame is already sorted by primary keys. 
sorted_result = result.sort_values(primary_keys) pd.testing.assert_frame_equal(result, sorted_result) @@ -256,6 +283,39 @@ def test_read_gbq_w_primary_keys_table( assert "FOR SYSTEM_TIME AS OF TIMESTAMP" in df.sql +def test_read_gbq_w_primary_keys_table_and_filters( + session: bigframes.Session, usa_names_grouped_table: bigquery.Table +): + """ + Verify fix for internal issue 338039517, where using filters didn't use the + primary keys for indexing / ordering. + """ + # Validate that the table we're querying has a primary key. + table = usa_names_grouped_table + table_constraints = table.table_constraints + assert table_constraints is not None + primary_key = table_constraints.primary_key + assert primary_key is not None + primary_keys = primary_key.columns + assert len(primary_keys) != 0 + + df = session.read_gbq( + f"{table.project}.{table.dataset_id}.{table.table_id}", + filters=[ + ("name", "LIKE", "W%"), + ("total_people", ">", 100), + ], # type: ignore + ) + result = df.to_pandas() + + # Verify that primary keys are used as the index. + assert list(result.index.names) == list(primary_keys) + + # Verify that the DataFrame is already sorted by primary keys. + sorted_result = result.sort_values(primary_keys) + pd.testing.assert_frame_equal(result, sorted_result) + + @pytest.mark.parametrize( ("query_or_table", "max_results"), [ @@ -350,13 +410,14 @@ def test_read_gbq_table_clustered_with_filter(session: bigframes.Session): ["read_gbq", "read_gbq_table"], ) @pytest.mark.parametrize( - ("filters", "table_id", "index_col", "columns"), + ("filters", "table_id", "index_col", "columns", "max_results"), [ pytest.param( [("_table_suffix", ">=", "1930"), ("_table_suffix", "<=", "1939")], _GSOD_ALL_TABLES, ["stn", "wban", "year", "mo", "da"], ["temp", "max", "min"], + 100, id="all", ), pytest.param( @@ -364,6 +425,7 @@ def test_read_gbq_table_clustered_with_filter(session: bigframes.Session): _GSOD_1930S, (), # index_col ["temp", "max", "min"], + None, # max_results id="columns", ), pytest.param( @@ -371,6 +433,7 @@ def test_read_gbq_table_clustered_with_filter(session: bigframes.Session): _GSOD_ALL_TABLES, (), # index_col, (), # columns + None, # max_results id="filters", ), pytest.param( @@ -378,8 +441,17 @@ def test_read_gbq_table_clustered_with_filter(session: bigframes.Session): _GSOD_1930S, ["stn", "wban", "year", "mo", "da"], (), # columns + None, # max_results id="index_col", ), + pytest.param( + (), # filters + _GSOD_1930S, + (), # index_col + (), # columns + 100, # max_results + id="max_results", + ), ], ) def test_read_gbq_wildcard( @@ -389,10 +461,17 @@ def test_read_gbq_wildcard( table_id: str, index_col: Sequence[str], columns: Sequence[str], + max_results: Optional[int], ): table_metadata = session.bqclient.get_table(table_id) method = getattr(session, api_method) - df = method(table_id, filters=filters, index_col=index_col, columns=columns) + df = method( + table_id, + filters=filters, + index_col=index_col, + columns=columns, + max_results=max_results, + ) num_rows, num_columns = df.shape if index_col: @@ -490,7 +569,8 @@ def test_read_pandas(session, scalars_dfs): def test_read_pandas_series(session): - idx = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) + # TODO(b/340887657): fix type error + idx = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) # type: ignore pd_series = pd.Series([3, 1, 4, 1, 5], dtype=pd.Int64Dtype(), index=idx) bf_series = session.read_pandas(pd_series) @@ -498,7 +578,8 @@ def test_read_pandas_series(session): def test_read_pandas_index(session): - 
pd_idx = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) + # TODO(b/340887657): fix type error + pd_idx = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) # type: ignore bf_idx = session.read_pandas(pd_idx) pd.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) @@ -516,7 +597,8 @@ def test_read_pandas_inline_respects_location(): df = session.read_pandas(pd.DataFrame([[1, 2, 3], [4, 5, 6]])) repr(df) - table = session.bqclient.get_table(df.query_job.destination) + # TODO(b/340887657): fix type error + table = session.bqclient.get_table(df.query_job.destination) # type: ignore assert table.location == "europe-west1" diff --git a/tests/system/utils.py b/tests/system/utils.py index e40502e6f2..ab4c2c119f 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -15,7 +15,7 @@ import base64 import decimal import functools -from typing import Iterable, Optional, Set +from typing import Iterable, Optional, Set, Union import geopandas as gpd # type: ignore import google.api_core.operation @@ -28,6 +28,23 @@ from bigframes.functions import remote_function +ML_REGRESSION_METRICS = [ + "mean_absolute_error", + "mean_squared_error", + "mean_squared_log_error", + "median_absolute_error", + "r2_score", + "explained_variance", +] +ML_CLASSFICATION_METRICS = [ + "precision", + "recall", + "accuracy", + "f1_score", + "log_loss", + "roc_auc", +] + def skip_legacy_pandas(test): @functools.wraps(test) @@ -249,6 +266,33 @@ def assert_pandas_df_equal_pca(actual, expected, **kwargs): pd.testing.assert_series_equal(-actual[column], expected[column], **kwargs) +def check_pandas_df_schema_and_index( + pd_df: pd.DataFrame, + columns: Iterable, + index: Union[int, Iterable], + col_exact: bool = True, +): + """Check the pandas DataFrame's schema and index, but not its values. + + Args: + pd_df: the input pandas df + columns: the expected columns to check against + index: int or Iterable. If int, only check the length (index size) of the df. If Iterable, check that the index values match. + col_exact: If True, check that the columns match exactly.
Otherwise, only check that the df contains all of those columns. + """ + if col_exact: + assert list(pd_df.columns) == list(columns) + else: + assert set(columns) <= set(pd_df.columns) + + if isinstance(index, int): + assert len(pd_df) == index + elif isinstance(index, Iterable): + assert list(pd_df.index) == list(index) + else: + raise ValueError("Unsupported index type.") + + def get_remote_function_endpoints( bigquery_client: bigquery.Client, dataset_id: str ) -> Set[str]: diff --git a/tests/unit/ml/test_api_primitives.py b/tests/unit/ml/test_api_primitives.py index da77a180a8..00a51ccfe9 100644 --- a/tests/unit/ml/test_api_primitives.py +++ b/tests/unit/ml/test_api_primitives.py @@ -30,8 +30,9 @@ def test_base_estimator_repr(): estimator = bigframes.ml.linear_model.LinearRegression(fit_intercept=True) assert estimator.__repr__() == "LinearRegression()" - estimator = bigframes.ml.decomposition.PCA(n_components=7) - assert estimator.__repr__() == "PCA(n_components=7)" + # TODO(b/340891292): fix type error + pca_estimator = bigframes.ml.decomposition.PCA(n_components=7) + assert pca_estimator.__repr__() == "PCA(n_components=7)" @pytest.mark.skipif(sklearn_linear_model is None, reason="requires sklearn") @@ -48,6 +49,7 @@ def test_base_estimator_repr_matches_sklearn(): sklearn_estimator = sklearn_linear_model.LinearRegression(fit_intercept=True) assert estimator.__repr__() == sklearn_estimator.__repr__() - estimator = bigframes.ml.decomposition.PCA(n_components=7) + # TODO(b/340891292): fix type error + pca_estimator = bigframes.ml.decomposition.PCA(n_components=7) sklearn_estimator = sklearn_decomposition.PCA(n_components=7) - assert estimator.__repr__() == sklearn_estimator.__repr__() + assert pca_estimator.__repr__() == sklearn_estimator.__repr__() diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 5a3470e883..8ba13a7276 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -13,6 +13,7 @@ # limitations under the License. 
import datetime +import re from typing import Iterable import google.cloud.bigquery as bigquery @@ -205,19 +206,16 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) @pytest.mark.parametrize( - ("query_or_table", "index_cols", "columns", "filters", "expected_output"), + ( + "query_or_table", + "index_cols", + "columns", + "filters", + "max_results", + "time_travel_timestamp", + "expected_output", + ), [ - pytest.param( - "test_table", - [], - [], - ["date_col", ">", "2022-10-20"], - None, - marks=pytest.mark.xfail( - raises=ValueError, - ), - id="raise_error", - ), pytest.param( "test_table", ["row_index"], @@ -226,30 +224,42 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) (("rowindex", "not in", [0, 6]),), (("string_col", "in", ["Hello, World!", "こんにちは"]),), ], + 123, # max_results, + datetime.datetime(2024, 5, 14, 12, 42, 36, 125125), ( - "SELECT `row_index`, `string_col` FROM `test_table` AS sub WHERE " - "`rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', " - "'こんにちは')" + "SELECT `row_index`, `string_col` FROM `test_table` " + "FOR SYSTEM_TIME AS OF TIMESTAMP('2024-05-14T12:42:36.125125') " + "WHERE `rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', " + "'こんにちは') LIMIT 123" ), id="table-all_params-filter_or_operation", ), pytest.param( - """SELECT - rowindex, - string_col, - FROM `test_table` AS t - """, + ( + """SELECT + rowindex, + string_col, + FROM `test_table` AS t + """ + ), ["rowindex"], ["string_col"], [ ("rowindex", "<", 4), ("string_col", "==", "Hello, World!"), ], - """SELECT `rowindex`, `string_col` FROM (SELECT - rowindex, - string_col, - FROM `test_table` AS t - ) AS sub WHERE `rowindex` < 4 AND `string_col` = 'Hello, World!'""", + 123, # max_results, + datetime.datetime(2024, 5, 14, 12, 42, 36, 125125), + ( + """SELECT `rowindex`, `string_col` FROM (SELECT + rowindex, + string_col, + FROM `test_table` AS t + ) """ + "FOR SYSTEM_TIME AS OF TIMESTAMP('2024-05-14T12:42:36.125125') " + "WHERE `rowindex` < 4 AND `string_col` = 'Hello, World!' 
" + "LIMIT 123" + ), id="subquery-all_params-filter_and_operation", ), pytest.param( @@ -257,7 +267,9 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) [], ["col_a", "col_b"], [], - "SELECT `col_a`, `col_b` FROM `test_table` AS sub", + None, # max_results + None, # time_travel_timestampe + "SELECT `col_a`, `col_b` FROM `test_table`", id="table-columns", ), pytest.param( @@ -265,7 +277,9 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) [], [], [("date_col", ">", "2022-10-20")], - "SELECT * FROM `test_table` AS sub WHERE `date_col` > '2022-10-20'", + None, # max_results + None, # time_travel_timestampe + "SELECT * FROM `test_table` WHERE `date_col` > '2022-10-20'", id="table-filter", ), pytest.param( @@ -273,7 +287,9 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) [], [], [], - "SELECT * FROM `test_table*` AS sub", + None, # max_results + None, # time_travel_timestampe + "SELECT * FROM `test_table*`", id="wildcard-no_params", ), pytest.param( @@ -281,30 +297,49 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) [], [], [("_TABLE_SUFFIX", ">", "2022-10-20")], - "SELECT * FROM `test_table*` AS sub WHERE `_TABLE_SUFFIX` > '2022-10-20'", + None, # max_results + None, # time_travel_timestampe + "SELECT * FROM `test_table*` WHERE `_TABLE_SUFFIX` > '2022-10-20'", id="wildcard-filter", ), ], ) -def test_to_query(query_or_table, index_cols, columns, filters, expected_output): +def test_to_query( + query_or_table, + index_cols, + columns, + filters, + max_results, + time_travel_timestamp, + expected_output, +): query = io_bq.to_query( query_or_table, - index_cols, - columns, - filters, + index_cols=index_cols, + columns=columns, + filters=filters, + max_results=max_results, + time_travel_timestamp=time_travel_timestamp, ) assert query == expected_output @pytest.mark.parametrize( - ("query_or_table", "filters", "expected_output"), - [], + ("filters", "expected_message"), + ( + pytest.param( + ["date_col", ">", "2022-10-20"], + "Elements of filters must be tuples of length 3, but got 'd'", + ), + ), ) -def test_to_query_with_wildcard_table(query_or_table, filters, expected_output): - query = io_bq.to_query( - query_or_table, - (), # index_cols - (), # columns - filters, - ) - assert query == expected_output +def test_to_query_fails_with_bad_filters(filters, expected_message): + with pytest.raises(ValueError, match=re.escape(expected_message)): + io_bq.to_query( + "test_table", + index_cols=(), + columns=(), + filters=filters, + max_results=None, + time_travel_timestamp=None, + ) diff --git a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py index 1d09769aec..52c86cd1e4 100644 --- a/tests/unit/session/test_read_gbq_table.py +++ b/tests/unit/session/test_read_gbq_table.py @@ -15,23 +15,104 @@ """Unit tests for read_gbq_table helper functions.""" import datetime +import unittest.mock as mock +import google.cloud.bigquery import google.cloud.bigquery as bigquery +import pytest import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table +from .. 
import resources + + +def test_get_ibis_time_travel_table_doesnt_timetravel_anonymous_datasets(): + bqsession = resources.create_bigquery_session() -def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets(): table_ref = bigquery.TableReference.from_string( "my-test-project._e8166e0cdb.anonbb92cd" ) - sql = bf_read_gbq_table._create_time_travel_sql( - table_ref, datetime.datetime.now(datetime.timezone.utc) + table_expression = bf_read_gbq_table.get_ibis_time_travel_table( + bqsession.ibis_client, + table_ref, + index_cols=(), + columns=(), + filters=(), + time_travel_timestamp=datetime.datetime.now(datetime.timezone.utc), ) + sql = table_expression.compile() # Anonymous query results tables don't support time travel. assert "SYSTEM_TIME" not in sql # Need fully-qualified table name. - assert "`my-test-project`.`_e8166e0cdb`.`anonbb92cd`" in sql + assert "my-test-project" in sql + + +@pytest.mark.parametrize( + ("index_cols", "primary_keys", "values_distinct", "expected"), + ( + (["col1", "col2"], ["col1", "col2", "col3"], False, False), + (["col1", "col2", "col3"], ["col1", "col2", "col3"], True, True), + ( + ["col2", "col3", "col1"], + [ + "col3", + "col2", + ], + True, + True, + ), + (["col1", "col2"], [], False, False), + ([], ["col1", "col2", "col3"], False, False), + ([], [], False, False), + ), +) +def test_are_index_cols_unique(index_cols, primary_keys, values_distinct, expected): + """If a primary key is set on the table, we use that as the index column + by default, no error should be raised in this case. + + See internal issue 335727141. + """ + table = google.cloud.bigquery.Table.from_api_repr( + { + "tableReference": { + "projectId": "my-project", + "datasetId": "my_dataset", + "tableId": "my_table", + }, + "clustering": { + "fields": ["col1", "col2"], + }, + }, + ) + table.schema = ( + google.cloud.bigquery.SchemaField("col1", "INT64"), + google.cloud.bigquery.SchemaField("col2", "INT64"), + google.cloud.bigquery.SchemaField("col3", "INT64"), + google.cloud.bigquery.SchemaField("col4", "INT64"), + ) + + # TODO(b/305264153): use setter for table_constraints in client library + # when available. 
+ table._properties["tableConstraints"] = { + "primaryKey": { + "columns": primary_keys, + }, + } + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) + bqclient.project = "test-project" + bqclient.get_table.return_value = table + + bqclient.query_and_wait.return_value = ( + {"total_count": 3, "distinct_count": 3 if values_distinct else 2}, + ) + session = resources.create_bigquery_session( + bqclient=bqclient, table_schema=table.schema + ) + table._properties["location"] = session._location + + result = bf_read_gbq_table.are_index_cols_unique(bqclient, table, index_cols, "") + + assert result == expected diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index bea858e037..4f5daebc87 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -179,6 +179,9 @@ def get_table_mock(table_ref): return table session.bqclient.get_table = get_table_mock + session.bqclient.query_and_wait.return_value = ( + {"total_count": 3, "distinct_count": 2}, + ) with pytest.warns(UserWarning, match=re.escape("use_cache=False")): df = session.read_gbq("my-project.my_dataset.my_table") @@ -200,6 +203,7 @@ def test_default_index_warning_raised_by_read_gbq(table): bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" bqclient.get_table.return_value = table + bqclient.query_and_wait.return_value = ({"total_count": 3, "distinct_count": 2},) session = resources.create_bigquery_session(bqclient=bqclient) table._properties["location"] = session._location @@ -222,6 +226,7 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_sequential_int64 bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" bqclient.get_table.return_value = table + bqclient.query_and_wait.return_value = ({"total_count": 4, "distinct_count": 3},) session = resources.create_bigquery_session(bqclient=bqclient) table._properties["location"] = session._location diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 535b748345..70639315be 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -95,7 +95,8 @@ def test_cut_raises_with_labels(): match="The 'labels' parameter must be either False or None.", ): mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) - bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"]) + # TODO(b/340893280): fix type error + bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"]) # type: ignore @pytest.mark.parametrize( diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index fddeab19a2..ecef2115e5 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -32,19 +32,41 @@ def _generate_array(translator, op: vendored_ibis_ops.GenerateArray): return f"GENERATE_ARRAY(0, {arg})" +def _safe_cast_to_datetime(translator, op: vendored_ibis_ops.SafeCastToDatetime): + arg = translator.translate(op.arg) + return f"SAFE_CAST({arg} AS DATETIME)" + + def _quantile(translator, op: ibis_reductions.Quantile): arg = translator.translate(op.arg) quantile = translator.translate(op.quantile) return f"PERCENTILE_CONT({arg}, {quantile})" +def _array_aggregate(translator, op: vendored_ibis_ops.ArrayAggregate): + """This method provides the same functionality as the 
collect() method in Ibis, with + the added capability of ordering the results using order_by. + https://github.com/ibis-project/ibis/issues/9170 + """ + arg = translator.translate(op.arg) + + order_by_sql = "" + if len(op.order_by) > 0: + order_by = ", ".join([translator.translate(column) for column in op.order_by]) + order_by_sql = f"ORDER BY {order_by}" + + return f"ARRAY_AGG({arg} IGNORE NULLS {order_by_sql})" + + patched_ops = { vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore vendored_ibis_ops.GenerateArray: _generate_array, # type:ignore + vendored_ibis_ops.SafeCastToDatetime: _safe_cast_to_datetime, # type:ignore ibis_reductions.Quantile: _quantile, # type:ignore + vendored_ibis_ops.ArrayAggregate: _array_aggregate, # type:ignore } OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py index 3d5a5a7fa0..3ae5fc10e4 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py @@ -2,6 +2,6 @@ from __future__ import annotations from bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F401 F403 -from bigframes_vendored.ibis.expr.operations.generic import * # noqa: F401 F403 +from bigframes_vendored.ibis.expr.operations.arrays import * # noqa: F401 F403 from bigframes_vendored.ibis.expr.operations.json import * # noqa: F401 F403 from bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F401 F403 diff --git a/third_party/bigframes_vendored/ibis/expr/operations/arrays.py b/third_party/bigframes_vendored/ibis/expr/operations/arrays.py new file mode 100644 index 0000000000..a0ad915a9b --- /dev/null +++ b/third_party/bigframes_vendored/ibis/expr/operations/arrays.py @@ -0,0 +1,18 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/arrays.py +from __future__ import annotations + +import ibis.expr.datatypes as dt +from ibis.expr.operations.core import Unary + + +class GenerateArray(Unary): + """ + Generates an array of values, similar to ibis.range(), but with simpler and + more efficient SQL generation. 
+ dtype = dt.Array(dt.int64) + + +class SafeCastToDatetime(Unary): + dtype = dt.Timestamp(timezone=None) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/generic.py b/third_party/bigframes_vendored/ibis/expr/operations/generic.py deleted file mode 100644 index 82d0a13371..0000000000 --- a/third_party/bigframes_vendored/ibis/expr/operations/generic.py +++ /dev/null @@ -1,9 +0,0 @@ -# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/generic.py -from __future__ import annotations - -import ibis.expr.datatypes as dt -from ibis.expr.operations.core import Unary - - -class GenerateArray(Unary): - dtype = dt.Array(dt.int64) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py index e6644f477a..bd971e408a 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py @@ -2,6 +2,8 @@ from __future__ import annotations +import ibis.common.annotations as ibis_annotations +from ibis.common.typing import VarTuple import ibis.expr.datatypes as dt import ibis.expr.operations.core as ibis_ops_core from ibis.expr.operations.reductions import Filterable, Reduction @@ -18,6 +20,18 @@ class ApproximateMultiQuantile(Filterable, Reduction): dtype = dt.Array(dt.float64) -__all__ = [ - "ApproximateMultiQuantile", -] +class ArrayAggregate(Filterable, Reduction): + """ + Collects the elements of this expression into an ordered array. Similar to + the ibis `ArrayCollect`, but adds an `order_by` parameter to control the + ordering of the collected elements. + """ + + arg: ibis_ops_core.Column + order_by: VarTuple[ibis_ops_core.Value] = () + + @ibis_annotations.attribute + def dtype(self): + return dt.Array(self.arg.dtype) + + +__all__ = ["ApproximateMultiQuantile", "ArrayAggregate"] diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 38ea208eaf..47a6013c4c 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -96,15 +96,15 @@ def read_gbq( Reading data with `columns` and `filters` parameters: >>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] - >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe']), ('pitcherLastName', 'in', ['Gant'])] + >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe']), ('pitcherLastName', 'in', ['Gant']), ('pitchSpeed', '>', 94)] >>> df = bpd.read_gbq( ... "bigquery-public-data.baseball.games_wide", ... columns=columns, ... filters=filters, ... ) >>> df.head(1) - pitcherFirstName pitcherLastName year pitchSpeed - 0 John Gant 2016 82 + pitcherFirstName pitcherLastName year pitchSpeed + 0 John Gant 2016 95 [1 rows x 4 columns]
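As a rough illustration (not part of the patch; the table and column names below are hypothetical), the `ArrayAggregate` op and the `_array_aggregate` translator added above are expected to compile to GoogleSQL along these lines:

    SELECT
      bool_col,
      -- IGNORE NULLS mirrors the translator's output; the ORDER BY key comes
      -- from the op's order_by tuple, producing a deterministically ordered array.
      ARRAY_AGG(int64_col IGNORE NULLS ORDER BY rowindex) AS int64_col_agg
    FROM my_dataset.my_table
    GROUP BY bool_col

This is a sketch of the aggregation shape that the new reduction makes expressible for grouped array collection; the exact aliasing and query structure BigQuery DataFrames generates may differ.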